crawlee 0.6.13b15__py3-none-any.whl → 1.3.1b3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlee might be problematic.

Files changed (116)
  1. crawlee/__init__.py +2 -1
  2. crawlee/_autoscaling/snapshotter.py +1 -1
  3. crawlee/_browserforge_workaround.py +7 -3
  4. crawlee/_request.py +64 -43
  5. crawlee/_service_locator.py +44 -24
  6. crawlee/_types.py +128 -36
  7. crawlee/_utils/context.py +3 -3
  8. crawlee/_utils/file.py +8 -1
  9. crawlee/_utils/globs.py +4 -4
  10. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  11. crawlee/_utils/recoverable_state.py +32 -8
  12. crawlee/_utils/recurring_task.py +27 -3
  13. crawlee/_utils/requests.py +0 -26
  14. crawlee/_utils/robots.py +17 -5
  15. crawlee/_utils/sitemap.py +16 -7
  16. crawlee/_utils/system.py +30 -14
  17. crawlee/_utils/time.py +120 -0
  18. crawlee/_utils/urls.py +9 -2
  19. crawlee/browsers/_browser_pool.py +5 -2
  20. crawlee/browsers/_playwright_browser.py +2 -1
  21. crawlee/browsers/_playwright_browser_controller.py +21 -15
  22. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  23. crawlee/browsers/_types.py +1 -1
  24. crawlee/configuration.py +2 -0
  25. crawlee/crawlers/__init__.py +5 -1
  26. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  27. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +57 -17
  28. crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
  29. crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
  30. crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
  31. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +27 -32
  32. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
  33. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
  34. crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
  35. crawlee/crawlers/_basic/_basic_crawler.py +254 -148
  36. crawlee/crawlers/_basic/_context_utils.py +24 -0
  37. crawlee/crawlers/_basic/_logging_utils.py +27 -4
  38. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  39. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  40. crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
  41. crawlee/crawlers/_playwright/_playwright_crawler.py +73 -23
  42. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  43. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  44. crawlee/crawlers/_playwright/_types.py +12 -2
  45. crawlee/errors.py +4 -0
  46. crawlee/events/_event_manager.py +12 -6
  47. crawlee/events/_types.py +6 -6
  48. crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
  49. crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
  50. crawlee/fingerprint_suite/_header_generator.py +2 -2
  51. crawlee/fingerprint_suite/_types.py +2 -2
  52. crawlee/http_clients/_base.py +4 -0
  53. crawlee/http_clients/_curl_impersonate.py +68 -14
  54. crawlee/http_clients/_httpx.py +16 -6
  55. crawlee/http_clients/_impit.py +32 -11
  56. crawlee/otel/crawler_instrumentor.py +4 -6
  57. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  58. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  59. crawlee/request_loaders/_request_list.py +3 -3
  60. crawlee/request_loaders/_request_loader.py +5 -1
  61. crawlee/request_loaders/_sitemap_request_loader.py +248 -50
  62. crawlee/router.py +13 -3
  63. crawlee/sessions/_cookies.py +13 -8
  64. crawlee/sessions/_models.py +5 -5
  65. crawlee/sessions/_session_pool.py +1 -1
  66. crawlee/statistics/_error_snapshotter.py +1 -1
  67. crawlee/statistics/_models.py +62 -12
  68. crawlee/statistics/_statistics.py +24 -33
  69. crawlee/storage_clients/__init__.py +16 -0
  70. crawlee/storage_clients/_base/_dataset_client.py +2 -2
  71. crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
  72. crawlee/storage_clients/_base/_request_queue_client.py +2 -2
  73. crawlee/storage_clients/_base/_storage_client.py +13 -0
  74. crawlee/storage_clients/_file_system/_dataset_client.py +33 -30
  75. crawlee/storage_clients/_file_system/_key_value_store_client.py +34 -27
  76. crawlee/storage_clients/_file_system/_request_queue_client.py +171 -154
  77. crawlee/storage_clients/_file_system/_storage_client.py +16 -3
  78. crawlee/storage_clients/_file_system/_utils.py +0 -0
  79. crawlee/storage_clients/_memory/_dataset_client.py +16 -4
  80. crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
  81. crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
  82. crawlee/storage_clients/_memory/_storage_client.py +6 -3
  83. crawlee/storage_clients/_redis/__init__.py +6 -0
  84. crawlee/storage_clients/_redis/_client_mixin.py +292 -0
  85. crawlee/storage_clients/_redis/_dataset_client.py +329 -0
  86. crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
  87. crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
  88. crawlee/storage_clients/_redis/_storage_client.py +149 -0
  89. crawlee/storage_clients/_redis/_utils.py +23 -0
  90. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  91. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  92. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  93. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  94. crawlee/storage_clients/_redis/py.typed +0 -0
  95. crawlee/storage_clients/_sql/__init__.py +6 -0
  96. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  97. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  98. crawlee/storage_clients/_sql/_db_models.py +268 -0
  99. crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
  100. crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
  101. crawlee/storage_clients/_sql/_storage_client.py +282 -0
  102. crawlee/storage_clients/_sql/py.typed +0 -0
  103. crawlee/storage_clients/models.py +21 -14
  104. crawlee/storages/_base.py +5 -1
  105. crawlee/storages/_dataset.py +12 -2
  106. crawlee/storages/_key_value_store.py +17 -4
  107. crawlee/storages/_request_queue.py +13 -5
  108. crawlee/storages/_storage_instance_manager.py +196 -75
  109. crawlee/storages/_utils.py +11 -0
  110. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/METADATA +24 -19
  111. crawlee-1.3.1b3.dist-info/RECORD +207 -0
  112. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/WHEEL +1 -1
  113. crawlee/_utils/measure_time.py +0 -31
  114. crawlee-0.6.13b15.dist-info/RECORD +0 -183
  115. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/entry_points.txt +0 -0
  116. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/licenses/LICENSE +0 -0
crawlee/crawlers/_basic/_basic_crawler.py

@@ -2,6 +2,7 @@
  from __future__ import annotations

  import asyncio
+ import functools
  import logging
  import signal
  import sys
@@ -13,8 +14,9 @@ from collections.abc import AsyncGenerator, Awaitable, Callable, Iterable, Seque
  from contextlib import AsyncExitStack, suppress
  from datetime import timedelta
  from functools import partial
+ from io import StringIO
  from pathlib import Path
- from typing import TYPE_CHECKING, Any, Generic, Literal, cast
+ from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast
  from urllib.parse import ParseResult, urlparse
  from weakref import WeakKeyDictionary

@@ -27,9 +29,12 @@ from crawlee import EnqueueStrategy, Glob, RequestTransformAction, service_locat
  from crawlee._autoscaling import AutoscaledPool, Snapshotter, SystemStatus
  from crawlee._log_config import configure_logger, get_configured_log_level, string_to_log_level
  from crawlee._request import Request, RequestOptions, RequestState
+ from crawlee._service_locator import ServiceLocator
  from crawlee._types import (
      BasicCrawlingContext,
      EnqueueLinksKwargs,
+     ExportDataCsvKwargs,
+     ExportDataJsonKwargs,
      GetKeyValueStoreFromRequestHandlerFunction,
      HttpHeaders,
      HttpPayload,
@@ -39,7 +44,7 @@ from crawlee._types import (
      SkippedReason,
  )
  from crawlee._utils.docs import docs_group
- from crawlee._utils.file import export_csv_to_stream, export_json_to_stream
+ from crawlee._utils.file import atomic_write, export_csv_to_stream, export_json_to_stream
  from crawlee._utils.recurring_task import RecurringTask
  from crawlee._utils.robots import RobotsTxtFile
  from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
@@ -54,6 +59,7 @@ from crawlee.errors import (
      RequestHandlerError,
      SessionError,
      UserDefinedErrorHandlerError,
+     UserHandlerTimeoutError,
  )
  from crawlee.events._types import Event, EventCrawlerStatusData
  from crawlee.http_clients import ImpitHttpClient
@@ -63,6 +69,7 @@ from crawlee.statistics import Statistics, StatisticsState
  from crawlee.storages import Dataset, KeyValueStore, RequestQueue

  from ._context_pipeline import ContextPipeline
+ from ._context_utils import swapped_context
  from ._logging_utils import (
      get_one_line_error_summary_if_possible,
      reduce_asyncio_timeout_error_to_relevant_traceback_parts,
@@ -95,6 +102,9 @@ if TYPE_CHECKING:
  TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
  TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
  TRequestIterator = TypeVar('TRequestIterator', str, Request)
+ TParams = ParamSpec('TParams')
+ T = TypeVar('T')
+
  ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Request | None]]
  FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]]
  SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]
@@ -204,7 +214,7 @@ class _BasicCrawlerOptions(TypedDict):
      Returning `None` suppresses the status message."""


- class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], TypedDict):
+ class _BasicCrawlerOptionsGeneric(TypedDict, Generic[TCrawlingContext, TStatisticsState]):
      """Generic options the `BasicCrawler` constructor."""

      request_handler: NotRequired[Callable[[TCrawlingContext], Awaitable[None]]]
@@ -219,9 +229,9 @@ class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], T


  class BasicCrawlerOptions(
-     Generic[TCrawlingContext, TStatisticsState],
      _BasicCrawlerOptions,
      _BasicCrawlerOptionsGeneric[TCrawlingContext, TStatisticsState],
+     Generic[TCrawlingContext, TStatisticsState],
  ):
      """Arguments for the `BasicCrawler` constructor.

@@ -346,14 +356,23 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
              _logger: A logger instance, typically provided by a subclass, for consistent logging labels.
                  Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
          """
-         if configuration:
-             service_locator.set_configuration(configuration)
-         if storage_client:
-             service_locator.set_storage_client(storage_client)
-         if event_manager:
-             service_locator.set_event_manager(event_manager)
+         implicit_event_manager_with_explicit_config = False
+         if not configuration:
+             configuration = service_locator.get_configuration()
+         elif not event_manager:
+             implicit_event_manager_with_explicit_config = True

-         config = service_locator.get_configuration()
+         if not storage_client:
+             storage_client = service_locator.get_storage_client()
+
+         if not event_manager:
+             event_manager = service_locator.get_event_manager()
+
+         self._service_locator = ServiceLocator(
+             configuration=configuration, storage_client=storage_client, event_manager=event_manager
+         )
+
+         config = self._service_locator.get_configuration()

          # Core components
          self._request_manager = request_manager
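
Usage sketch (not part of the diff): with the constructor change above, explicitly passed services are kept in a per-crawler ServiceLocator instead of being written into the global service_locator. A minimal example, assuming MemoryStorageClient is exported from crawlee.storage_clients and that HttpCrawler forwards these keyword arguments to BasicCrawler, as in current releases:

import asyncio

from crawlee.configuration import Configuration
from crawlee.crawlers import HttpCrawler
from crawlee.storage_clients import MemoryStorageClient


async def main() -> None:
    # Services passed here stay scoped to this crawler instance; the global
    # service_locator is only consulted for anything left unspecified.
    crawler = HttpCrawler(
        configuration=Configuration(),
        storage_client=MemoryStorageClient(),
    )
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())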
@@ -391,7 +410,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
          self._context_result_map = WeakKeyDictionary[BasicCrawlingContext, RequestHandlerRunResult]()

          # Context pipeline
-         self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)
+         self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)  # ty: ignore[invalid-argument-type]

          # Crawl settings
          self._max_request_retries = max_request_retries
@@ -419,17 +438,31 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
          httpx_logger = logging.getLogger('httpx')  # Silence HTTPX logger
          httpx_logger.setLevel(logging.DEBUG if get_configured_log_level() <= logging.DEBUG else logging.WARNING)
          self._logger = _logger or logging.getLogger(__name__)
+         if implicit_event_manager_with_explicit_config:
+             self._logger.warning(
+                 'No event manager set, implicitly using event manager from global service_locator.'
+                 'It is advised to explicitly set the event manager if explicit configuration is used as well.'
+             )
          self._statistics_log_format = statistics_log_format

          # Statistics
-         self._statistics = statistics or cast(
-             'Statistics[TStatisticsState]',
-             Statistics.with_default_state(
-                 periodic_message_logger=self._logger,
-                 statistics_log_format=self._statistics_log_format,
-                 log_message='Current request statistics:',
-             ),
-         )
+         if statistics:
+             self._statistics = statistics
+         else:
+
+             async def persist_state_factory() -> KeyValueStore:
+                 return await self.get_key_value_store()
+
+             self._statistics = cast(
+                 'Statistics[TStatisticsState]',
+                 Statistics.with_default_state(
+                     persistence_enabled=True,
+                     periodic_message_logger=self._logger,
+                     statistics_log_format=self._statistics_log_format,
+                     log_message='Current request statistics:',
+                     persist_state_kvs_factory=persist_state_factory,
+                 ),
+             )

          # Additional context managers to enter and exit
          self._additional_context_managers = _additional_context_managers or []
@@ -496,6 +529,24 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
          self._logger.info(f'Crawler.stop() was called with following reason: {reason}.')
          self._unexpected_stop = True

+     def _wrap_handler_with_error_context(
+         self, handler: Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]
+     ) -> Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]:
+         """Decorate error handlers to make their context helpers usable."""
+
+         @functools.wraps(handler)
+         async def wrapped_handler(context: TCrawlingContext | BasicCrawlingContext, exception: Exception) -> T:
+             # Original context helpers that are from `RequestHandlerRunResult` will not be committed as the request
+             # failed. Modified context provides context helpers with direct access to the storages.
+             error_context = context.create_modified_copy(
+                 push_data=self._push_data,
+                 get_key_value_store=self.get_key_value_store,
+                 add_requests=functools.partial(self._add_requests, context),
+             )
+             return await handler(error_context, exception)
+
+         return wrapped_handler
+
      def _stop_if_max_requests_count_exceeded(self) -> None:
          """Call `stop` when the maximum number of requests to crawl has been reached."""
          if self._max_requests_per_crawl is None:
@@ -548,7 +599,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
      async def get_request_manager(self) -> RequestManager:
          """Return the configured request manager. If none is configured, open and return the default request queue."""
          if not self._request_manager:
-             self._request_manager = await RequestQueue.open()
+             self._request_manager = await RequestQueue.open(
+                 storage_client=self._service_locator.get_storage_client(),
+                 configuration=self._service_locator.get_configuration(),
+             )

          return self._request_manager

@@ -557,18 +611,32 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
          *,
          id: str | None = None,
          name: str | None = None,
+         alias: str | None = None,
      ) -> Dataset:
          """Return the `Dataset` with the given ID or name. If none is provided, return the default one."""
-         return await Dataset.open(id=id, name=name)
+         return await Dataset.open(
+             id=id,
+             name=name,
+             alias=alias,
+             storage_client=self._service_locator.get_storage_client(),
+             configuration=self._service_locator.get_configuration(),
+         )

      async def get_key_value_store(
          self,
          *,
          id: str | None = None,
          name: str | None = None,
+         alias: str | None = None,
      ) -> KeyValueStore:
          """Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS."""
-         return await KeyValueStore.open(id=id, name=name)
+         return await KeyValueStore.open(
+             id=id,
+             name=name,
+             alias=alias,
+             storage_client=self._service_locator.get_storage_client(),
+             configuration=self._service_locator.get_configuration(),
+         )

      def error_handler(
          self, handler: ErrorHandler[TCrawlingContext | BasicCrawlingContext]
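
Usage note (not from the package): the new alias parameter on get_dataset and get_key_value_store opens a run-scoped, unnamed storage, while name keeps referring to a globally named one (see the docstring changes further below). A small sketch, assuming a BasicCrawler instance is available:

from crawlee.crawlers import BasicCrawler


async def save_debug_data(crawler: BasicCrawler) -> None:
    # An aliased key-value store and dataset live alongside the default ones
    # for this run, without claiming a global name.
    kvs = await crawler.get_key_value_store(alias='debug')
    await kvs.set_value('last-status', {'ok': True})

    dataset = await crawler.get_dataset(alias='debug-items')
    await dataset.push_data({'note': 'example item'})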
@@ -577,7 +645,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

          The error handler is invoked after a request handler error occurs and before a retry attempt.
          """
-         self._error_handler = handler
+         self._error_handler = self._wrap_handler_with_error_context(handler)
          return handler

      def failed_request_handler(
@@ -587,7 +655,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

          The failed request handler is invoked when a request has failed all retry attempts.
          """
-         self._failed_request_handler = handler
+         self._failed_request_handler = self._wrap_handler_with_error_context(handler)
          return handler

      def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequestCallback:
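
Usage note (illustrative, not an excerpt from the diff): because both decorators now wrap the handler with _wrap_handler_with_error_context, context helpers such as push_data can be used inside error and failed-request handlers and write directly to the storages, since the failed request's buffered result is never committed. A minimal sketch:

from crawlee._types import BasicCrawlingContext
from crawlee.crawlers import BasicCrawler

crawler = BasicCrawler()


@crawler.failed_request_handler
async def record_failure(context: BasicCrawlingContext, error: Exception) -> None:
    # push_data here bypasses the per-request result buffer and goes straight
    # to the dataset, which is what the wrapping above enables.
    await context.push_data({'url': context.request.url, 'error': str(error)})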
@@ -627,7 +695,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
          request_manager = await self.get_request_manager()
          if purge_request_queue and isinstance(request_manager, RequestQueue):
              await request_manager.drop()
-             self._request_manager = await RequestQueue.open()
+             self._request_manager = await RequestQueue.open(
+                 storage_client=self._service_locator.get_storage_client(),
+                 configuration=self._service_locator.get_configuration(),
+             )

          if requests is not None:
              await self.add_requests(requests)
@@ -654,7 +725,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
          except CancelledError:
              pass
          finally:
-             await self._crawler_state_rec_task.stop()
              if threading.current_thread() is threading.main_thread():
                  with suppress(NotImplementedError):
                      asyncio.get_running_loop().remove_signal_handler(signal.SIGINT)
@@ -684,9 +754,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
          return final_statistics

      async def _run_crawler(self) -> None:
-         event_manager = service_locator.get_event_manager()
-
-         self._crawler_state_rec_task.start()
+         event_manager = self._service_locator.get_event_manager()

          # Collect the context managers to be entered. Context managers that are already active are excluded,
          # as they were likely entered by the caller, who will also be responsible for exiting them.
@@ -698,6 +766,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                  self._statistics,
                  self._session_pool if self._use_session_pool else None,
                  self._http_client,
+                 self._crawler_state_rec_task,
                  *self._additional_context_managers,
              )
              if cm and getattr(cm, 'active', False) is False
@@ -705,7 +774,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

          async with AsyncExitStack() as exit_stack:
              for context in contexts_to_enter:
-                 await exit_stack.enter_async_context(context)  # type: ignore[arg-type]
+                 await exit_stack.enter_async_context(context)  # ty: ignore[invalid-argument-type]

              await self._autoscaled_pool.run()

@@ -772,6 +841,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
          self,
          dataset_id: str | None = None,
          dataset_name: str | None = None,
+         dataset_alias: str | None = None,
          **kwargs: Unpack[GetDataKwargs],
      ) -> DatasetItemsListPage:
          """Retrieve data from a `Dataset`.
@@ -781,13 +851,20 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

          Args:
              dataset_id: The ID of the `Dataset`.
-             dataset_name: The name of the `Dataset`.
+             dataset_name: The name of the `Dataset` (global scope, named storage).
+             dataset_alias: The alias of the `Dataset` (run scope, unnamed storage).
              kwargs: Keyword arguments to be passed to the `Dataset.get_data()` method.

          Returns:
              The retrieved data.
          """
-         dataset = await Dataset.open(id=dataset_id, name=dataset_name)
+         dataset = await Dataset.open(
+             id=dataset_id,
+             name=dataset_name,
+             alias=dataset_alias,
+             storage_client=self._service_locator.get_storage_client(),
+             configuration=self._service_locator.get_configuration(),
+         )
          return await dataset.get_data(**kwargs)

      async def export_data(
@@ -795,6 +872,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
          path: str | Path,
          dataset_id: str | None = None,
          dataset_name: str | None = None,
+         dataset_alias: str | None = None,
+         **additional_kwargs: Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs],
      ) -> None:
          """Export all items from a Dataset to a JSON or CSV file.

@@ -804,18 +883,31 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

          Args:
              path: The destination file path. Must end with '.json' or '.csv'.
-             dataset_id: The ID of the Dataset to export from. If None, uses `name` parameter instead.
-             dataset_name: The name of the Dataset to export from. If None, uses `id` parameter instead.
+             dataset_id: The ID of the Dataset to export from.
+             dataset_name: The name of the Dataset to export from (global scope, named storage).
+             dataset_alias: The alias of the Dataset to export from (run scope, unnamed storage).
+             additional_kwargs: Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format.
          """
-         dataset = await self.get_dataset(id=dataset_id, name=dataset_name)
+         dataset = await Dataset.open(
+             id=dataset_id,
+             name=dataset_name,
+             alias=dataset_alias,
+             storage_client=self._service_locator.get_storage_client(),
+             configuration=self._service_locator.get_configuration(),
+         )

-         path = path if isinstance(path, Path) else Path(path)
-         dst = path.open('w', newline='')
+         path = Path(path)

          if path.suffix == '.csv':
-             await export_csv_to_stream(dataset.iterate_items(), dst)
+             dst = StringIO()
+             csv_kwargs = cast('ExportDataCsvKwargs', additional_kwargs)
+             await export_csv_to_stream(dataset.iterate_items(), dst, **csv_kwargs)
+             await atomic_write(path, dst.getvalue())
          elif path.suffix == '.json':
-             await export_json_to_stream(dataset.iterate_items(), dst)
+             dst = StringIO()
+             json_kwargs = cast('ExportDataJsonKwargs', additional_kwargs)
+             await export_json_to_stream(dataset.iterate_items(), dst, **json_kwargs)
+             await atomic_write(path, dst.getvalue())
          else:
              raise ValueError(f'Unsupported file extension: {path.suffix}')
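
Usage note (illustrative): export_data now picks the exporter from the file suffix, writes through an in-memory buffer plus atomic_write, and accepts dataset_alias together with exporter-specific keyword arguments. A short sketch, assuming a finished crawler instance:

from crawlee.crawlers import BasicCrawler


async def export_results(crawler: BasicCrawler) -> None:
    # The suffix selects the exporter; extra keyword arguments would be
    # forwarded to the JSON or CSV writer (ExportDataJsonKwargs / ExportDataCsvKwargs).
    await crawler.export_data('results.json')
    await crawler.export_data('debug.csv', dataset_alias='debug-items')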
@@ -824,6 +916,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
          data: list[dict[str, Any]] | dict[str, Any],
          dataset_id: str | None = None,
          dataset_name: str | None = None,
+         dataset_alias: str | None = None,
          **kwargs: Unpack[PushDataKwargs],
      ) -> None:
          """Push data to a `Dataset`.
@@ -834,10 +927,11 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
          Args:
              data: The data to push to the `Dataset`.
              dataset_id: The ID of the `Dataset`.
-             dataset_name: The name of the `Dataset`.
+             dataset_name: The name of the `Dataset` (global scope, named storage).
+             dataset_alias: The alias of the `Dataset` (run scope, unnamed storage).
              kwargs: Keyword arguments to be passed to the `Dataset.push_data()` method.
          """
-         dataset = await self.get_dataset(id=dataset_id, name=dataset_name)
+         dataset = await self.get_dataset(id=dataset_id, name=dataset_name, alias=dataset_alias)
          await dataset.push_data(data, **kwargs)

      def _should_retry_request(self, context: BasicCrawlingContext, error: Exception) -> bool:
@@ -894,6 +988,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
          transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
          | None = None,
          requests: Sequence[str | Request] | None = None,
+         rq_id: str | None = None,
+         rq_name: str | None = None,
+         rq_alias: str | None = None,
          **kwargs: Unpack[EnqueueLinksKwargs],
      ) -> None:
          kwargs.setdefault('strategy', 'same-hostname')
@@ -905,7 +1002,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                  '`transform_request_function` arguments when `requests` is provided.'
              )
              # Add directly passed requests.
-             await context.add_requests(requests or list[str | Request](), **kwargs)
+             await context.add_requests(
+                 requests or list[str | Request](), rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, **kwargs
+             )
          else:
              # Add requests from extracted links.
              await context.add_requests(
@@ -914,7 +1013,11 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                      label=label,
                      user_data=user_data,
                      transform_request_function=transform_request_function,
+                     **kwargs,
                  ),
+                 rq_id=rq_id,
+                 rq_name=rq_name,
+                 rq_alias=rq_alias,
                  **kwargs,
              )
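
Usage note (an assumption based on the forwarding shown above, not an excerpt from the package): since rq_id, rq_name and rq_alias are passed through to context.add_requests, a handler should be able to route links into a request queue other than the crawler's default one. A sketch:

from crawlee._types import BasicCrawlingContext
from crawlee.crawlers import BasicCrawler

crawler = BasicCrawler()


@crawler.router.default_handler
async def handler(context: BasicCrawlingContext) -> None:
    # Send follow-up requests to a separate, aliased request queue instead of
    # the queue this crawler is currently consuming.
    await context.add_requests(['https://example.com/next'], rq_alias='secondary')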
@@ -936,7 +1039,12 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
          warning_flag = True

          for request in request_iterator:
-             target_url = request.url if isinstance(request, Request) else request
+             if isinstance(request, Request):
+                 if request.enqueue_strategy != strategy:
+                     request.enqueue_strategy = strategy
+                 target_url = request.url
+             else:
+                 target_url = request
              parsed_target_url = urlparse(target_url)

              if warning_flag and strategy != 'all' and not parsed_target_url.hostname:
@@ -948,9 +1056,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
              ) and self._check_url_patterns(target_url, kwargs.get('include'), kwargs.get('exclude')):
                  yield request

-             limit = limit - 1 if limit is not None else None
-             if limit and limit <= 0:
-                 break
+             if limit is not None:
+                 limit -= 1
+                 if limit <= 0:
+                     break

      def _check_enqueue_strategy(
          self,
@@ -974,8 +1083,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
              return target_url.hostname == origin_url.hostname

          if strategy == 'same-domain':
-             origin_domain = self._tld_extractor.extract_str(origin_url.hostname).domain
-             target_domain = self._tld_extractor.extract_str(target_url.hostname).domain
+             origin_domain = self._tld_extractor.extract_str(origin_url.hostname).top_domain_under_public_suffix
+             target_domain = self._tld_extractor.extract_str(target_url.hostname).top_domain_under_public_suffix
              return origin_domain == target_domain

          if strategy == 'same-origin':
@@ -1031,8 +1140,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

          if self._should_retry_request(context, error):
              request.retry_count += 1
+             reduced_error = str(error).split('\n')[0]
              self.log.warning(
-                 f'Retrying request to {context.request.url} due to: {error} \n'
+                 f'Retrying request to {context.request.url} due to: {reduced_error}. '
                  f'{get_one_line_error_summary_if_possible(error)}'
              )
              await self._statistics.error_tracker.add(error=error, context=context)
@@ -1043,21 +1153,17 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
              except Exception as e:
                  raise UserDefinedErrorHandlerError('Exception thrown in user-defined request error handler') from e
              else:
-                 if new_request is not None:
-                     request = new_request
+                 if new_request is not None and new_request != request:
+                     await request_manager.add_request(new_request)
+                     await self._mark_request_as_handled(request)
+                     return

              await request_manager.reclaim_request(request)
          else:
-             await wait_for(
-                 lambda: request_manager.mark_request_as_handled(context.request),
-                 timeout=self._internal_timeout,
-                 timeout_message='Marking request as handled timed out after '
-                 f'{self._internal_timeout.total_seconds()} seconds',
-                 logger=self._logger,
-                 max_retries=3,
-             )
+             request.state = RequestState.ERROR
+             await self._mark_request_as_handled(request)
              await self._handle_failed_request(context, error)
-             self._statistics.record_request_processing_failure(request.id or request.unique_key)
+             self._statistics.record_request_processing_failure(request.unique_key)

      async def _handle_request_error(self, context: TCrawlingContext | BasicCrawlingContext, error: Exception) -> None:
          try:
@@ -1070,8 +1176,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                  f'{self._internal_timeout.total_seconds()} seconds',
                  logger=self._logger,
              )
-
-             context.request.state = RequestState.DONE
          except UserDefinedErrorHandlerError:
              context.request.state = RequestState.ERROR
              raise
@@ -1104,17 +1208,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
          self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False
      ) -> None:
          if need_mark and isinstance(request, Request):
-             request_manager = await self.get_request_manager()
-
-             await wait_for(
-                 lambda: request_manager.mark_request_as_handled(request),
-                 timeout=self._internal_timeout,
-                 timeout_message='Marking request as handled timed out after '
-                 f'{self._internal_timeout.total_seconds()} seconds',
-                 logger=self._logger,
-                 max_retries=3,
-             )
              request.state = RequestState.SKIPPED
+             await self._mark_request_as_handled(request)

          url = request.url if isinstance(request, Request) else request

@@ -1134,10 +1229,11 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

          if (
              isinstance(error, asyncio.exceptions.TimeoutError)
+             and traceback_parts
              and self._request_handler_timeout_text in traceback_parts[-1]
-         ):
+         ) or isinstance(error, UserHandlerTimeoutError):
              used_traceback_parts = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
-             used_traceback_parts.append(traceback_parts[-1])
+             used_traceback_parts.extend(traceback_parts[-1:])

          return ''.join(used_traceback_parts).strip('\n')

@@ -1186,47 +1282,61 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
          else:
              yield Request.from_url(url)

-     async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
-         """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
-         result = self._context_result_map[context]
-
-         request_manager = await self.get_request_manager()
-         origin = context.request.loaded_url or context.request.url
-
-         for add_requests_call in result.add_requests_calls:
-             requests = list[Request]()
-
-             base_url = url if (url := add_requests_call.get('base_url')) else origin
-
-             requests_iterator = self._convert_url_to_request_iterator(add_requests_call['requests'], base_url)
+     async def _add_requests(
+         self,
+         context: BasicCrawlingContext,
+         requests: Sequence[str | Request],
+         rq_id: str | None = None,
+         rq_name: str | None = None,
+         rq_alias: str | None = None,
+         **kwargs: Unpack[EnqueueLinksKwargs],
+     ) -> None:
+         """Add requests method aware of the crawling context."""
+         if rq_id or rq_name or rq_alias:
+             request_manager: RequestManager = await RequestQueue.open(
+                 id=rq_id,
+                 name=rq_name,
+                 alias=rq_alias,
+                 storage_client=self._service_locator.get_storage_client(),
+                 configuration=self._service_locator.get_configuration(),
+             )
+         else:
+             request_manager = await self.get_request_manager()

-             enqueue_links_kwargs: EnqueueLinksKwargs = {k: v for k, v in add_requests_call.items() if k != 'requests'}  # type: ignore[assignment]
+         context_aware_requests = list[Request]()
+         base_url = kwargs.get('base_url') or context.request.loaded_url or context.request.url
+         requests_iterator = self._convert_url_to_request_iterator(requests, base_url)
+         filter_requests_iterator = self._enqueue_links_filter_iterator(requests_iterator, context.request.url, **kwargs)
+         for dst_request in filter_requests_iterator:
+             # Update the crawl depth of the request.
+             dst_request.crawl_depth = context.request.crawl_depth + 1

-             filter_requests_iterator = self._enqueue_links_filter_iterator(
-                 requests_iterator, context.request.url, **enqueue_links_kwargs
-             )
+             if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:
+                 context_aware_requests.append(dst_request)

-             for dst_request in filter_requests_iterator:
-                 # Update the crawl depth of the request.
-                 dst_request.crawl_depth = context.request.crawl_depth + 1
+         return await request_manager.add_requests(context_aware_requests)

-                 if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:
-                     requests.append(dst_request)
+     async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
+         """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
+         result = self._context_result_map[context]

-             await request_manager.add_requests(requests)
+         for add_requests_call in result.add_requests_calls:
+             await self._add_requests(context, **add_requests_call)

          for push_data_call in result.push_data_calls:
              await self._push_data(**push_data_call)

          await self._commit_key_value_store_changes(result, get_kvs=self.get_key_value_store)

+         result.apply_request_changes(target=context.request)
+
      @staticmethod
      async def _commit_key_value_store_changes(
          result: RequestHandlerRunResult, get_kvs: GetKeyValueStoreFromRequestHandlerFunction
      ) -> None:
          """Store key value store changes recorded in result."""
-         for (id, name), changes in result.key_value_store_changes.items():
-             store = await get_kvs(id=id, name=name)
+         for (id, name, alias), changes in result.key_value_store_changes.items():
+             store = await get_kvs(id=id, name=name, alias=alias)
              for key, value in changes.updates.items():
                  await store.set_value(key, value.content, value.content_type)

@@ -1274,7 +1384,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

          if not (await self._is_allowed_based_on_robots_txt_file(request.url)):
              self._logger.warning(
-                 f'Skipping request {request.url} ({request.id}) because it is disallowed based on robots.txt'
+                 f'Skipping request {request.url} ({request.unique_key}) because it is disallowed based on robots.txt'
              )

              await self._handle_skipped_request(request, 'robots_txt', need_mark=True)
@@ -1285,10 +1395,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
          else:
              session = await self._get_session()
              proxy_info = await self._get_proxy_info(request, session)
-             result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store)
+             result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store, request=request)

          context = BasicCrawlingContext(
-             request=request,
+             request=result.request,
              session=session,
              proxy_info=proxy_info,
              send_request=self._prepare_send_request_function(session, proxy_info),
@@ -1300,38 +1410,31 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
          )
          self._context_result_map[context] = result

-         statistics_id = request.id or request.unique_key
-         self._statistics.record_request_processing_start(statistics_id)
+         self._statistics.record_request_processing_start(request.unique_key)

          try:
              request.state = RequestState.REQUEST_HANDLER

-             self._check_request_collision(context.request, context.session)
-
              try:
-                 await self._run_request_handler(context=context)
+                 with swapped_context(context, request):
+                     self._check_request_collision(request, session)
+                     await self._run_request_handler(context=context)
              except asyncio.TimeoutError as e:
                  raise RequestHandlerError(e, context) from e

              await self._commit_request_handler_result(context)
-             await wait_for(
-                 lambda: request_manager.mark_request_as_handled(context.request),
-                 timeout=self._internal_timeout,
-                 timeout_message='Marking request as handled timed out after '
-                 f'{self._internal_timeout.total_seconds()} seconds',
-                 logger=self._logger,
-                 max_retries=3,
-             )

              request.state = RequestState.DONE

-             if context.session and context.session.is_usable:
-                 context.session.mark_good()
+             await self._mark_request_as_handled(request)

-             self._statistics.record_request_processing_finish(statistics_id)
+             if session and session.is_usable:
+                 session.mark_good()
+
+             self._statistics.record_request_processing_finish(request.unique_key)

          except RequestCollisionError as request_error:
-             context.request.no_retry = True
+             request.no_retry = True
              await self._handle_request_error(context, request_error)

          except RequestHandlerError as primary_error:
@@ -1346,46 +1449,34 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
              await self._handle_request_error(primary_error.crawling_context, primary_error.wrapped_exception)

          except SessionError as session_error:
-             if not context.session:
+             if not session:
                  raise RuntimeError('SessionError raised in a crawling context without a session') from session_error

              if self._error_handler:
                  await self._error_handler(context, session_error)

              if self._should_retry_request(context, session_error):
-                 self._logger.warning('Encountered a session error, rotating session and retrying')
+                 exc_only = ''.join(traceback.format_exception_only(session_error)).strip()
+                 self._logger.warning('Encountered "%s", rotating session and retrying...', exc_only)

-                 context.session.retire()
+                 if session:
+                     session.retire()

                  # Increment session rotation count.
-                 context.request.session_rotation_count = (context.request.session_rotation_count or 0) + 1
+                 request.session_rotation_count = (request.session_rotation_count or 0) + 1

                  await request_manager.reclaim_request(request)
                  await self._statistics.error_tracker_retry.add(error=session_error, context=context)
              else:
-                 await wait_for(
-                     lambda: request_manager.mark_request_as_handled(context.request),
-                     timeout=self._internal_timeout,
-                     timeout_message='Marking request as handled timed out after '
-                     f'{self._internal_timeout.total_seconds()} seconds',
-                     logger=self._logger,
-                     max_retries=3,
-                 )
+                 await self._mark_request_as_handled(request)

                  await self._handle_failed_request(context, session_error)
-                 self._statistics.record_request_processing_failure(statistics_id)
+                 self._statistics.record_request_processing_failure(request.unique_key)

          except ContextPipelineInterruptedError as interrupted_error:
              self._logger.debug('The context pipeline was interrupted', exc_info=interrupted_error)

-             await wait_for(
-                 lambda: request_manager.mark_request_as_handled(context.request),
-                 timeout=self._internal_timeout,
-                 timeout_message='Marking request as handled timed out after '
-                 f'{self._internal_timeout.total_seconds()} seconds',
-                 logger=self._logger,
-                 max_retries=3,
-             )
+             await self._mark_request_as_handled(request)

          except ContextPipelineInitializationError as initialization_error:
              self._logger.debug(
@@ -1403,12 +1494,16 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
              raise

      async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
-         await wait_for(
-             lambda: self._context_pipeline(context, self.router),
-             timeout=self._request_handler_timeout,
-             timeout_message=f'{self._request_handler_timeout_text}'
-             f' {self._request_handler_timeout.total_seconds()} seconds',
-             logger=self._logger,
+         context.request.state = RequestState.BEFORE_NAV
+         await self._context_pipeline(
+             context,
+             lambda final_context: wait_for(
+                 lambda: self.router(final_context),
+                 timeout=self._request_handler_timeout,
+                 timeout_message=f'{self._request_handler_timeout_text}'
+                 f' {self._request_handler_timeout.total_seconds()} seconds',
+                 logger=self._logger,
+             ),
          )

      def _raise_for_error_status_code(self, status_code: int) -> None:
@@ -1519,7 +1614,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

      async def _crawler_state_task(self) -> None:
          """Emit a persist state event with the given migration status."""
-         event_manager = service_locator.get_event_manager()
+         event_manager = self._service_locator.get_event_manager()

          current_state = self.statistics.state

@@ -1556,3 +1651,14 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
          )

          self._previous_crawler_state = current_state
+
+     async def _mark_request_as_handled(self, request: Request) -> None:
+         request_manager = await self.get_request_manager()
+         await wait_for(
+             lambda: request_manager.mark_request_as_handled(request),
+             timeout=self._internal_timeout,
+             timeout_message='Marking request as handled timed out after '
+             f'{self._internal_timeout.total_seconds()} seconds',
+             logger=self._logger,
+             max_retries=3,
+         )