crawlee 0.6.13b17__py3-none-any.whl → 1.1.2b7__py3-none-any.whl

This diff compares the contents of two package versions as published to one of the supported public registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in those registries.

Potentially problematic release.


This version of crawlee might be problematic.

Files changed (102)
  1. crawlee/_autoscaling/snapshotter.py +1 -1
  2. crawlee/_request.py +35 -33
  3. crawlee/_service_locator.py +44 -24
  4. crawlee/_types.py +106 -34
  5. crawlee/_utils/context.py +2 -2
  6. crawlee/_utils/file.py +7 -0
  7. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  8. crawlee/_utils/recoverable_state.py +32 -8
  9. crawlee/_utils/recurring_task.py +17 -1
  10. crawlee/_utils/requests.py +0 -26
  11. crawlee/_utils/robots.py +17 -5
  12. crawlee/_utils/sitemap.py +4 -2
  13. crawlee/_utils/system.py +3 -3
  14. crawlee/_utils/time.py +120 -0
  15. crawlee/_utils/urls.py +9 -2
  16. crawlee/browsers/_browser_pool.py +4 -1
  17. crawlee/browsers/_playwright_browser_controller.py +21 -15
  18. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  19. crawlee/browsers/_types.py +1 -1
  20. crawlee/configuration.py +2 -0
  21. crawlee/crawlers/__init__.py +2 -1
  22. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  23. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +50 -12
  24. crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
  25. crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
  26. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +39 -15
  27. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
  28. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
  29. crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
  30. crawlee/crawlers/_basic/_basic_crawler.py +219 -126
  31. crawlee/crawlers/_basic/_logging_utils.py +5 -1
  32. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  33. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  34. crawlee/crawlers/_playwright/_playwright_crawler.py +60 -11
  35. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  36. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  37. crawlee/crawlers/_playwright/_types.py +12 -2
  38. crawlee/events/_event_manager.py +4 -4
  39. crawlee/events/_types.py +6 -6
  40. crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
  41. crawlee/fingerprint_suite/_header_generator.py +2 -2
  42. crawlee/fingerprint_suite/_types.py +2 -2
  43. crawlee/http_clients/_base.py +4 -0
  44. crawlee/http_clients/_curl_impersonate.py +12 -0
  45. crawlee/http_clients/_httpx.py +16 -6
  46. crawlee/http_clients/_impit.py +25 -10
  47. crawlee/otel/crawler_instrumentor.py +3 -3
  48. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  49. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  50. crawlee/request_loaders/_request_list.py +3 -3
  51. crawlee/request_loaders/_request_loader.py +5 -1
  52. crawlee/request_loaders/_sitemap_request_loader.py +248 -50
  53. crawlee/sessions/_models.py +2 -2
  54. crawlee/sessions/_session_pool.py +1 -1
  55. crawlee/statistics/_error_snapshotter.py +1 -1
  56. crawlee/statistics/_models.py +43 -4
  57. crawlee/statistics/_statistics.py +24 -33
  58. crawlee/storage_clients/__init__.py +16 -0
  59. crawlee/storage_clients/_base/_request_queue_client.py +2 -2
  60. crawlee/storage_clients/_base/_storage_client.py +13 -0
  61. crawlee/storage_clients/_file_system/_dataset_client.py +29 -27
  62. crawlee/storage_clients/_file_system/_key_value_store_client.py +30 -26
  63. crawlee/storage_clients/_file_system/_request_queue_client.py +169 -153
  64. crawlee/storage_clients/_file_system/_storage_client.py +16 -3
  65. crawlee/storage_clients/_file_system/_utils.py +0 -0
  66. crawlee/storage_clients/_memory/_dataset_client.py +16 -4
  67. crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
  68. crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
  69. crawlee/storage_clients/_memory/_storage_client.py +6 -3
  70. crawlee/storage_clients/_redis/__init__.py +6 -0
  71. crawlee/storage_clients/_redis/_client_mixin.py +295 -0
  72. crawlee/storage_clients/_redis/_dataset_client.py +325 -0
  73. crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
  74. crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
  75. crawlee/storage_clients/_redis/_storage_client.py +146 -0
  76. crawlee/storage_clients/_redis/_utils.py +23 -0
  77. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  78. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  79. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  80. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  81. crawlee/storage_clients/_redis/py.typed +0 -0
  82. crawlee/storage_clients/_sql/__init__.py +6 -0
  83. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  84. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  85. crawlee/storage_clients/_sql/_db_models.py +268 -0
  86. crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
  87. crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
  88. crawlee/storage_clients/_sql/_storage_client.py +282 -0
  89. crawlee/storage_clients/_sql/py.typed +0 -0
  90. crawlee/storage_clients/models.py +13 -11
  91. crawlee/storages/_base.py +5 -1
  92. crawlee/storages/_dataset.py +12 -2
  93. crawlee/storages/_key_value_store.py +17 -4
  94. crawlee/storages/_request_queue.py +13 -5
  95. crawlee/storages/_storage_instance_manager.py +133 -71
  96. crawlee/storages/_utils.py +11 -0
  97. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/METADATA +18 -6
  98. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/RECORD +101 -78
  99. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/WHEEL +1 -1
  100. crawlee/_utils/measure_time.py +0 -31
  101. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/entry_points.txt +0 -0
  102. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/licenses/LICENSE +0 -0
crawlee/crawlers/_basic/_basic_crawler.py

@@ -2,6 +2,7 @@
  from __future__ import annotations

  import asyncio
+ import functools
  import logging
  import signal
  import sys
@@ -13,8 +14,9 @@ from collections.abc import AsyncGenerator, Awaitable, Callable, Iterable, Seque
  from contextlib import AsyncExitStack, suppress
  from datetime import timedelta
  from functools import partial
+ from io import StringIO
  from pathlib import Path
- from typing import TYPE_CHECKING, Any, Generic, Literal, cast
+ from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast
  from urllib.parse import ParseResult, urlparse
  from weakref import WeakKeyDictionary

@@ -27,9 +29,12 @@ from crawlee import EnqueueStrategy, Glob, RequestTransformAction, service_locat
  from crawlee._autoscaling import AutoscaledPool, Snapshotter, SystemStatus
  from crawlee._log_config import configure_logger, get_configured_log_level, string_to_log_level
  from crawlee._request import Request, RequestOptions, RequestState
+ from crawlee._service_locator import ServiceLocator
  from crawlee._types import (
  BasicCrawlingContext,
  EnqueueLinksKwargs,
+ ExportDataCsvKwargs,
+ ExportDataJsonKwargs,
  GetKeyValueStoreFromRequestHandlerFunction,
  HttpHeaders,
  HttpPayload,
@@ -39,7 +44,7 @@ from crawlee._types import (
  SkippedReason,
  )
  from crawlee._utils.docs import docs_group
- from crawlee._utils.file import export_csv_to_stream, export_json_to_stream
+ from crawlee._utils.file import atomic_write, export_csv_to_stream, export_json_to_stream
  from crawlee._utils.recurring_task import RecurringTask
  from crawlee._utils.robots import RobotsTxtFile
  from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
@@ -95,6 +100,9 @@ if TYPE_CHECKING:
  TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
  TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
  TRequestIterator = TypeVar('TRequestIterator', str, Request)
+ TParams = ParamSpec('TParams')
+ T = TypeVar('T')
+
  ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Request | None]]
  FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]]
  SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]
@@ -204,7 +212,7 @@ class _BasicCrawlerOptions(TypedDict):
  Returning `None` suppresses the status message."""


- class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], TypedDict):
+ class _BasicCrawlerOptionsGeneric(TypedDict, Generic[TCrawlingContext, TStatisticsState]):
  """Generic options the `BasicCrawler` constructor."""

  request_handler: NotRequired[Callable[[TCrawlingContext], Awaitable[None]]]
@@ -219,9 +227,9 @@ class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], T


  class BasicCrawlerOptions(
- Generic[TCrawlingContext, TStatisticsState],
  _BasicCrawlerOptions,
  _BasicCrawlerOptionsGeneric[TCrawlingContext, TStatisticsState],
+ Generic[TCrawlingContext, TStatisticsState],
  ):
  """Arguments for the `BasicCrawler` constructor.

@@ -346,14 +354,23 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  _logger: A logger instance, typically provided by a subclass, for consistent logging labels.
  Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
  """
- if configuration:
- service_locator.set_configuration(configuration)
- if storage_client:
- service_locator.set_storage_client(storage_client)
- if event_manager:
- service_locator.set_event_manager(event_manager)
+ implicit_event_manager_with_explicit_config = False
+ if not configuration:
+ configuration = service_locator.get_configuration()
+ elif not event_manager:
+ implicit_event_manager_with_explicit_config = True

- config = service_locator.get_configuration()
+ if not storage_client:
+ storage_client = service_locator.get_storage_client()
+
+ if not event_manager:
+ event_manager = service_locator.get_event_manager()
+
+ self._service_locator = ServiceLocator(
+ configuration=configuration, storage_client=storage_client, event_manager=event_manager
+ )
+
+ config = self._service_locator.get_configuration()

  # Core components
  self._request_manager = request_manager
@@ -419,17 +436,31 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  httpx_logger = logging.getLogger('httpx') # Silence HTTPX logger
  httpx_logger.setLevel(logging.DEBUG if get_configured_log_level() <= logging.DEBUG else logging.WARNING)
  self._logger = _logger or logging.getLogger(__name__)
+ if implicit_event_manager_with_explicit_config:
+ self._logger.warning(
+ 'No event manager set, implicitly using event manager from global service_locator.'
+ 'It is advised to explicitly set the event manager if explicit configuration is used as well.'
+ )
  self._statistics_log_format = statistics_log_format

  # Statistics
- self._statistics = statistics or cast(
- 'Statistics[TStatisticsState]',
- Statistics.with_default_state(
- periodic_message_logger=self._logger,
- statistics_log_format=self._statistics_log_format,
- log_message='Current request statistics:',
- ),
- )
+ if statistics:
+ self._statistics = statistics
+ else:
+
+ async def persist_state_factory() -> KeyValueStore:
+ return await self.get_key_value_store()
+
+ self._statistics = cast(
+ 'Statistics[TStatisticsState]',
+ Statistics.with_default_state(
+ persistence_enabled=True,
+ periodic_message_logger=self._logger,
+ statistics_log_format=self._statistics_log_format,
+ log_message='Current request statistics:',
+ persist_state_kvs_factory=persist_state_factory,
+ ),
+ )

  # Additional context managers to enter and exit
  self._additional_context_managers = _additional_context_managers or []
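For orientation, a usage sketch that is not part of the diff: with the crawler-scoped ServiceLocator introduced above, passing configuration, storage_client and event_manager together keeps every service explicit and avoids the 'No event manager set' warning that fires when only an explicit configuration is supplied. The import paths and constructor defaults below are assumptions about crawlee's public API, not something shown in this hunk.

    # Hypothetical sketch; check import paths and defaults against the crawlee docs.
    import asyncio

    from crawlee.configuration import Configuration
    from crawlee.crawlers import BasicCrawler, BasicCrawlingContext
    from crawlee.events import LocalEventManager
    from crawlee.storage_clients import MemoryStorageClient


    async def main() -> None:
        crawler = BasicCrawler(
            # All three services are explicit, so the crawler's own ServiceLocator
            # is fully populated and nothing is resolved from the global one.
            configuration=Configuration(),
            storage_client=MemoryStorageClient(),
            event_manager=LocalEventManager(),
        )

        @crawler.router.default_handler
        async def handler(context: BasicCrawlingContext) -> None:
            context.log.info(f'Processing {context.request.url}')

        await crawler.run(['https://crawlee.dev'])


    asyncio.run(main())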
@@ -496,6 +527,24 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  self._logger.info(f'Crawler.stop() was called with following reason: {reason}.')
  self._unexpected_stop = True

+ def _wrap_handler_with_error_context(
+ self, handler: Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]
+ ) -> Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]:
+ """Decorate error handlers to make their context helpers usable."""
+
+ @functools.wraps(handler)
+ async def wrapped_handler(context: TCrawlingContext | BasicCrawlingContext, exception: Exception) -> T:
+ # Original context helpers that are from `RequestHandlerRunResult` will not be committed as the request
+ # failed. Modified context provides context helpers with direct access to the storages.
+ error_context = context.create_modified_copy(
+ push_data=self._push_data,
+ get_key_value_store=self.get_key_value_store,
+ add_requests=functools.partial(self._add_requests, context),
+ )
+ return await handler(error_context, exception)
+
+ return wrapped_handler
+
  def _stop_if_max_requests_count_exceeded(self) -> None:
  """Call `stop` when the maximum number of requests to crawl has been reached."""
  if self._max_requests_per_crawl is None:
@@ -548,7 +597,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  async def get_request_manager(self) -> RequestManager:
  """Return the configured request manager. If none is configured, open and return the default request queue."""
  if not self._request_manager:
- self._request_manager = await RequestQueue.open()
+ self._request_manager = await RequestQueue.open(
+ storage_client=self._service_locator.get_storage_client(),
+ configuration=self._service_locator.get_configuration(),
+ )

  return self._request_manager

@@ -557,18 +609,32 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  *,
  id: str | None = None,
  name: str | None = None,
+ alias: str | None = None,
  ) -> Dataset:
  """Return the `Dataset` with the given ID or name. If none is provided, return the default one."""
- return await Dataset.open(id=id, name=name)
+ return await Dataset.open(
+ id=id,
+ name=name,
+ alias=alias,
+ storage_client=self._service_locator.get_storage_client(),
+ configuration=self._service_locator.get_configuration(),
+ )

  async def get_key_value_store(
  self,
  *,
  id: str | None = None,
  name: str | None = None,
+ alias: str | None = None,
  ) -> KeyValueStore:
  """Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS."""
- return await KeyValueStore.open(id=id, name=name)
+ return await KeyValueStore.open(
+ id=id,
+ name=name,
+ alias=alias,
+ storage_client=self._service_locator.get_storage_client(),
+ configuration=self._service_locator.get_configuration(),
+ )

  def error_handler(
  self, handler: ErrorHandler[TCrawlingContext | BasicCrawlingContext]
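An illustrative sketch of the new alias parameter (the helper function and the alias/name values below are assumptions): alias opens a run-scoped, unnamed storage, while name keeps addressing a globally named one, and both now resolve through the crawler's own storage client and configuration.

    from crawlee.crawlers import BasicCrawler


    async def store_examples(crawler: BasicCrawler) -> None:
        # Run-scoped, unnamed key-value store addressed by its alias.
        cache = await crawler.get_key_value_store(alias='warm-up-cache')
        await cache.set_value('seen-urls', ['https://crawlee.dev'])

        # Globally named dataset that is shared across runs.
        results = await crawler.get_dataset(name='products')
        await results.push_data({'title': 'example item'})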
@@ -577,7 +643,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

  The error handler is invoked after a request handler error occurs and before a retry attempt.
  """
- self._error_handler = handler
+ self._error_handler = self._wrap_handler_with_error_context(handler)
  return handler

  def failed_request_handler(
@@ -587,7 +653,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

  The failed request handler is invoked when a request has failed all retry attempts.
  """
- self._failed_request_handler = handler
+ self._failed_request_handler = self._wrap_handler_with_error_context(handler)
  return handler

  def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequestCallback:
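A minimal sketch of how the wrapped handlers can be used; the handler bodies are assumptions, only the decorator names and signatures come from the diff. Because _wrap_handler_with_error_context rebuilds the context with direct-to-storage helpers, a call such as context.push_data inside a failed request handler is persisted even though the failed request's own handler result is never committed.

    from crawlee import Request
    from crawlee.crawlers import BasicCrawler, BasicCrawlingContext

    crawler = BasicCrawler()


    @crawler.error_handler
    async def log_retry(context: BasicCrawlingContext, error: Exception) -> Request | None:
        # Runs before a retry attempt; returning None keeps the original request.
        context.log.warning(f'Retrying {context.request.url}: {error}')
        return None


    @crawler.failed_request_handler
    async def record_failure(context: BasicCrawlingContext, error: Exception) -> None:
        # Written straight to the default dataset thanks to the wrapped context.
        await context.push_data({'url': context.request.url, 'error': str(error)})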
@@ -627,7 +693,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  request_manager = await self.get_request_manager()
  if purge_request_queue and isinstance(request_manager, RequestQueue):
  await request_manager.drop()
- self._request_manager = await RequestQueue.open()
+ self._request_manager = await RequestQueue.open(
+ storage_client=self._service_locator.get_storage_client(),
+ configuration=self._service_locator.get_configuration(),
+ )

  if requests is not None:
  await self.add_requests(requests)
@@ -654,7 +723,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  except CancelledError:
  pass
  finally:
- await self._crawler_state_rec_task.stop()
  if threading.current_thread() is threading.main_thread():
  with suppress(NotImplementedError):
  asyncio.get_running_loop().remove_signal_handler(signal.SIGINT)
@@ -684,9 +752,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  return final_statistics

  async def _run_crawler(self) -> None:
- event_manager = service_locator.get_event_manager()
-
- self._crawler_state_rec_task.start()
+ event_manager = self._service_locator.get_event_manager()

  # Collect the context managers to be entered. Context managers that are already active are excluded,
  # as they were likely entered by the caller, who will also be responsible for exiting them.
@@ -698,6 +764,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  self._statistics,
  self._session_pool if self._use_session_pool else None,
  self._http_client,
+ self._crawler_state_rec_task,
  *self._additional_context_managers,
  )
  if cm and getattr(cm, 'active', False) is False
@@ -772,6 +839,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  self,
  dataset_id: str | None = None,
  dataset_name: str | None = None,
+ dataset_alias: str | None = None,
  **kwargs: Unpack[GetDataKwargs],
  ) -> DatasetItemsListPage:
  """Retrieve data from a `Dataset`.
@@ -781,13 +849,20 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

  Args:
  dataset_id: The ID of the `Dataset`.
- dataset_name: The name of the `Dataset`.
+ dataset_name: The name of the `Dataset` (global scope, named storage).
+ dataset_alias: The alias of the `Dataset` (run scope, unnamed storage).
  kwargs: Keyword arguments to be passed to the `Dataset.get_data()` method.

  Returns:
  The retrieved data.
  """
- dataset = await Dataset.open(id=dataset_id, name=dataset_name)
+ dataset = await Dataset.open(
+ id=dataset_id,
+ name=dataset_name,
+ alias=dataset_alias,
+ storage_client=self._service_locator.get_storage_client(),
+ configuration=self._service_locator.get_configuration(),
+ )
  return await dataset.get_data(**kwargs)

  async def export_data(
@@ -795,6 +870,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  path: str | Path,
  dataset_id: str | None = None,
  dataset_name: str | None = None,
+ dataset_alias: str | None = None,
+ **additional_kwargs: Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs], # type: ignore[misc]
  ) -> None:
  """Export all items from a Dataset to a JSON or CSV file.

@@ -804,18 +881,31 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

  Args:
  path: The destination file path. Must end with '.json' or '.csv'.
- dataset_id: The ID of the Dataset to export from. If None, uses `name` parameter instead.
- dataset_name: The name of the Dataset to export from. If None, uses `id` parameter instead.
+ dataset_id: The ID of the Dataset to export from.
+ dataset_name: The name of the Dataset to export from (global scope, named storage).
+ dataset_alias: The alias of the Dataset to export from (run scope, unnamed storage).
+ additional_kwargs: Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format.
  """
- dataset = await self.get_dataset(id=dataset_id, name=dataset_name)
+ dataset = await Dataset.open(
+ id=dataset_id,
+ name=dataset_name,
+ alias=dataset_alias,
+ storage_client=self._service_locator.get_storage_client(),
+ configuration=self._service_locator.get_configuration(),
+ )

- path = path if isinstance(path, Path) else Path(path)
- dst = path.open('w', newline='')
+ path = Path(path)

  if path.suffix == '.csv':
- await export_csv_to_stream(dataset.iterate_items(), dst)
+ dst = StringIO()
+ csv_kwargs = cast('ExportDataCsvKwargs', additional_kwargs)
+ await export_csv_to_stream(dataset.iterate_items(), dst, **csv_kwargs)
+ await atomic_write(path, dst.getvalue())
  elif path.suffix == '.json':
- await export_json_to_stream(dataset.iterate_items(), dst)
+ dst = StringIO()
+ json_kwargs = cast('ExportDataJsonKwargs', additional_kwargs)
+ await export_json_to_stream(dataset.iterate_items(), dst, **json_kwargs)
+ await atomic_write(path, dst.getvalue())
  else:
  raise ValueError(f'Unsupported file extension: {path.suffix}')
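A usage sketch for the reworked export. The exact keys accepted by ExportDataCsvKwargs and ExportDataJsonKwargs are not visible in this hunk, so the delimiter and indent arguments below are guesses that mirror the standard csv and json module options.

    from crawlee.crawlers import BasicCrawler


    async def export_results(crawler: BasicCrawler) -> None:
        # CSV export: extra kwargs are forwarded to the CSV exporter,
        # and the file is written atomically via atomic_write().
        await crawler.export_data('results.csv', dataset_alias='run-results', delimiter=';')

        # JSON export: extra kwargs are forwarded to the JSON exporter.
        await crawler.export_data('results.json', dataset_name='products', indent=2)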
@@ -824,6 +914,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  data: list[dict[str, Any]] | dict[str, Any],
  dataset_id: str | None = None,
  dataset_name: str | None = None,
+ dataset_alias: str | None = None,
  **kwargs: Unpack[PushDataKwargs],
  ) -> None:
  """Push data to a `Dataset`.
@@ -834,10 +925,11 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  Args:
  data: The data to push to the `Dataset`.
  dataset_id: The ID of the `Dataset`.
- dataset_name: The name of the `Dataset`.
+ dataset_name: The name of the `Dataset` (global scope, named storage).
+ dataset_alias: The alias of the `Dataset` (run scope, unnamed storage).
  kwargs: Keyword arguments to be passed to the `Dataset.push_data()` method.
  """
- dataset = await self.get_dataset(id=dataset_id, name=dataset_name)
+ dataset = await self.get_dataset(id=dataset_id, name=dataset_name, alias=dataset_alias)
  await dataset.push_data(data, **kwargs)

  def _should_retry_request(self, context: BasicCrawlingContext, error: Exception) -> bool:
@@ -894,6 +986,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
  | None = None,
  requests: Sequence[str | Request] | None = None,
+ rq_id: str | None = None,
+ rq_name: str | None = None,
+ rq_alias: str | None = None,
  **kwargs: Unpack[EnqueueLinksKwargs],
  ) -> None:
  kwargs.setdefault('strategy', 'same-hostname')
@@ -905,7 +1000,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  '`transform_request_function` arguments when `requests` is provided.'
  )
  # Add directly passed requests.
- await context.add_requests(requests or list[str | Request](), **kwargs)
+ await context.add_requests(
+ requests or list[str | Request](), rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, **kwargs
+ )
  else:
  # Add requests from extracted links.
  await context.add_requests(
@@ -914,7 +1011,11 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  label=label,
  user_data=user_data,
  transform_request_function=transform_request_function,
+ **kwargs,
  ),
+ rq_id=rq_id,
+ rq_name=rq_name,
+ rq_alias=rq_alias,
  **kwargs,
  )
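For illustration, a sketch of what the new rq_id / rq_name / rq_alias parameters enable from a request handler (the handler body and the queue alias are assumptions): requests added through the context helpers can now be routed into a request queue other than the crawler's default one.

    from crawlee.crawlers import BasicCrawler, BasicCrawlingContext

    crawler = BasicCrawler()


    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        # Send follow-up requests to a separate, run-scoped request queue
        # identified by its alias instead of the crawler's default queue.
        await context.add_requests(
            ['https://crawlee.dev/docs'],
            rq_alias='follow-up-queue',
        )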
@@ -974,8 +1075,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  return target_url.hostname == origin_url.hostname

  if strategy == 'same-domain':
- origin_domain = self._tld_extractor.extract_str(origin_url.hostname).domain
- target_domain = self._tld_extractor.extract_str(target_url.hostname).domain
+ origin_domain = self._tld_extractor.extract_str(origin_url.hostname).top_domain_under_public_suffix
+ target_domain = self._tld_extractor.extract_str(target_url.hostname).top_domain_under_public_suffix
  return origin_domain == target_domain

  if strategy == 'same-origin':
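The switch from .domain to .top_domain_under_public_suffix tightens the same-domain check on multi-label public suffixes. Assuming a recent tldextract release that exposes this attribute (the successor to registered_domain), the difference looks roughly like this:

    import tldextract

    result = tldextract.extract('shop.example.co.uk')

    # `.domain` is only the label directly left of the public suffix.
    print(result.domain)  # 'example'

    # The new attribute keeps the public suffix, so two hosts count as the
    # same domain only when they share 'example.co.uk', not merely 'example'.
    print(result.top_domain_under_public_suffix)  # 'example.co.uk'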
@@ -1031,8 +1132,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

  if self._should_retry_request(context, error):
  request.retry_count += 1
+ reduced_error = str(error).split('\n')[0]
  self.log.warning(
- f'Retrying request to {context.request.url} due to: {error} \n'
+ f'Retrying request to {context.request.url} due to: {reduced_error}'
  f'{get_one_line_error_summary_if_possible(error)}'
  )
  await self._statistics.error_tracker.add(error=error, context=context)
@@ -1043,21 +1145,16 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  except Exception as e:
  raise UserDefinedErrorHandlerError('Exception thrown in user-defined request error handler') from e
  else:
- if new_request is not None:
- request = new_request
+ if new_request is not None and new_request != request:
+ await request_manager.add_request(new_request)
+ await self._mark_request_as_handled(request)
+ return

  await request_manager.reclaim_request(request)
  else:
- await wait_for(
- lambda: request_manager.mark_request_as_handled(context.request),
- timeout=self._internal_timeout,
- timeout_message='Marking request as handled timed out after '
- f'{self._internal_timeout.total_seconds()} seconds',
- logger=self._logger,
- max_retries=3,
- )
+ await self._mark_request_as_handled(request)
  await self._handle_failed_request(context, error)
- self._statistics.record_request_processing_failure(request.id or request.unique_key)
+ self._statistics.record_request_processing_failure(request.unique_key)

  async def _handle_request_error(self, context: TCrawlingContext | BasicCrawlingContext, error: Exception) -> None:
  try:
@@ -1104,16 +1201,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False
  ) -> None:
  if need_mark and isinstance(request, Request):
- request_manager = await self.get_request_manager()
-
- await wait_for(
- lambda: request_manager.mark_request_as_handled(request),
- timeout=self._internal_timeout,
- timeout_message='Marking request as handled timed out after '
- f'{self._internal_timeout.total_seconds()} seconds',
- logger=self._logger,
- max_retries=3,
- )
+ await self._mark_request_as_handled(request)
  request.state = RequestState.SKIPPED

  url = request.url if isinstance(request, Request) else request
@@ -1186,34 +1274,46 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  else:
  yield Request.from_url(url)

- async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
- """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
- result = self._context_result_map[context]
-
- request_manager = await self.get_request_manager()
- origin = context.request.loaded_url or context.request.url
-
- for add_requests_call in result.add_requests_calls:
- requests = list[Request]()
-
- base_url = url if (url := add_requests_call.get('base_url')) else origin
-
- requests_iterator = self._convert_url_to_request_iterator(add_requests_call['requests'], base_url)
+ async def _add_requests(
+ self,
+ context: BasicCrawlingContext,
+ requests: Sequence[str | Request],
+ rq_id: str | None = None,
+ rq_name: str | None = None,
+ rq_alias: str | None = None,
+ **kwargs: Unpack[EnqueueLinksKwargs],
+ ) -> None:
+ """Add requests method aware of the crawling context."""
+ if rq_id or rq_name or rq_alias:
+ request_manager: RequestManager = await RequestQueue.open(
+ id=rq_id,
+ name=rq_name,
+ alias=rq_alias,
+ storage_client=self._service_locator.get_storage_client(),
+ configuration=self._service_locator.get_configuration(),
+ )
+ else:
+ request_manager = await self.get_request_manager()

- enqueue_links_kwargs: EnqueueLinksKwargs = {k: v for k, v in add_requests_call.items() if k != 'requests'} # type: ignore[assignment]
+ context_aware_requests = list[Request]()
+ base_url = kwargs.get('base_url') or context.request.loaded_url or context.request.url
+ requests_iterator = self._convert_url_to_request_iterator(requests, base_url)
+ filter_requests_iterator = self._enqueue_links_filter_iterator(requests_iterator, context.request.url, **kwargs)
+ for dst_request in filter_requests_iterator:
+ # Update the crawl depth of the request.
+ dst_request.crawl_depth = context.request.crawl_depth + 1

- filter_requests_iterator = self._enqueue_links_filter_iterator(
- requests_iterator, context.request.url, **enqueue_links_kwargs
- )
+ if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:
+ context_aware_requests.append(dst_request)

- for dst_request in filter_requests_iterator:
- # Update the crawl depth of the request.
- dst_request.crawl_depth = context.request.crawl_depth + 1
+ return await request_manager.add_requests(context_aware_requests)

- if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:
- requests.append(dst_request)
+ async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
+ """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
+ result = self._context_result_map[context]

- await request_manager.add_requests(requests)
+ for add_requests_call in result.add_requests_calls:
+ await self._add_requests(context, **add_requests_call)

  for push_data_call in result.push_data_calls:
  await self._push_data(**push_data_call)
@@ -1225,8 +1325,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  result: RequestHandlerRunResult, get_kvs: GetKeyValueStoreFromRequestHandlerFunction
  ) -> None:
  """Store key value store changes recorded in result."""
- for (id, name), changes in result.key_value_store_changes.items():
- store = await get_kvs(id=id, name=name)
+ for (id, name, alias), changes in result.key_value_store_changes.items():
+ store = await get_kvs(id=id, name=name, alias=alias)
  for key, value in changes.updates.items():
  await store.set_value(key, value.content, value.content_type)

@@ -1274,7 +1374,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

  if not (await self._is_allowed_based_on_robots_txt_file(request.url)):
  self._logger.warning(
- f'Skipping request {request.url} ({request.id}) because it is disallowed based on robots.txt'
+ f'Skipping request {request.url} ({request.unique_key}) because it is disallowed based on robots.txt'
  )

  await self._handle_skipped_request(request, 'robots_txt', need_mark=True)
@@ -1300,8 +1400,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  )
  self._context_result_map[context] = result

- statistics_id = request.id or request.unique_key
- self._statistics.record_request_processing_start(statistics_id)
+ self._statistics.record_request_processing_start(request.unique_key)

  try:
  request.state = RequestState.REQUEST_HANDLER
@@ -1314,21 +1413,15 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  raise RequestHandlerError(e, context) from e

  await self._commit_request_handler_result(context)
- await wait_for(
- lambda: request_manager.mark_request_as_handled(context.request),
- timeout=self._internal_timeout,
- timeout_message='Marking request as handled timed out after '
- f'{self._internal_timeout.total_seconds()} seconds',
- logger=self._logger,
- max_retries=3,
- )
+
+ await self._mark_request_as_handled(request)

  request.state = RequestState.DONE

  if context.session and context.session.is_usable:
  context.session.mark_good()

- self._statistics.record_request_processing_finish(statistics_id)
+ self._statistics.record_request_processing_finish(request.unique_key)

  except RequestCollisionError as request_error:
  context.request.no_retry = True
@@ -1364,29 +1457,15 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  await request_manager.reclaim_request(request)
  await self._statistics.error_tracker_retry.add(error=session_error, context=context)
  else:
- await wait_for(
- lambda: request_manager.mark_request_as_handled(context.request),
- timeout=self._internal_timeout,
- timeout_message='Marking request as handled timed out after '
- f'{self._internal_timeout.total_seconds()} seconds',
- logger=self._logger,
- max_retries=3,
- )
+ await self._mark_request_as_handled(request)

  await self._handle_failed_request(context, session_error)
- self._statistics.record_request_processing_failure(statistics_id)
+ self._statistics.record_request_processing_failure(request.unique_key)

  except ContextPipelineInterruptedError as interrupted_error:
  self._logger.debug('The context pipeline was interrupted', exc_info=interrupted_error)

- await wait_for(
- lambda: request_manager.mark_request_as_handled(context.request),
- timeout=self._internal_timeout,
- timeout_message='Marking request as handled timed out after '
- f'{self._internal_timeout.total_seconds()} seconds',
- logger=self._logger,
- max_retries=3,
- )
+ await self._mark_request_as_handled(request)

  except ContextPipelineInitializationError as initialization_error:
  self._logger.debug(
@@ -1404,12 +1483,15 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  raise

  async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
- await wait_for(
- lambda: self._context_pipeline(context, self.router),
- timeout=self._request_handler_timeout,
- timeout_message=f'{self._request_handler_timeout_text}'
- f' {self._request_handler_timeout.total_seconds()} seconds',
- logger=self._logger,
+ await self._context_pipeline(
+ context,
+ lambda final_context: wait_for(
+ lambda: self.router(final_context),
+ timeout=self._request_handler_timeout,
+ timeout_message=f'{self._request_handler_timeout_text}'
+ f' {self._request_handler_timeout.total_seconds()} seconds',
+ logger=self._logger,
+ ),
  )

  def _raise_for_error_status_code(self, status_code: int) -> None:
@@ -1520,7 +1602,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

  async def _crawler_state_task(self) -> None:
  """Emit a persist state event with the given migration status."""
- event_manager = service_locator.get_event_manager()
+ event_manager = self._service_locator.get_event_manager()

  current_state = self.statistics.state

@@ -1557,3 +1639,14 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  )

  self._previous_crawler_state = current_state
+
+ async def _mark_request_as_handled(self, request: Request) -> None:
+ request_manager = await self.get_request_manager()
+ await wait_for(
+ lambda: request_manager.mark_request_as_handled(request),
+ timeout=self._internal_timeout,
+ timeout_message='Marking request as handled timed out after '
+ f'{self._internal_timeout.total_seconds()} seconds',
+ logger=self._logger,
+ max_retries=3,
+ )