crawlee 0.6.13b31__py3-none-any.whl → 1.1.1b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlee might be problematic.

Files changed (82)
  1. crawlee/_autoscaling/snapshotter.py +1 -1
  2. crawlee/_request.py +34 -22
  3. crawlee/_service_locator.py +44 -24
  4. crawlee/_types.py +86 -33
  5. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  6. crawlee/_utils/recoverable_state.py +32 -8
  7. crawlee/_utils/recurring_task.py +15 -0
  8. crawlee/_utils/robots.py +17 -5
  9. crawlee/_utils/sitemap.py +1 -1
  10. crawlee/_utils/system.py +3 -3
  11. crawlee/_utils/urls.py +9 -2
  12. crawlee/browsers/_browser_pool.py +4 -1
  13. crawlee/browsers/_playwright_browser_controller.py +21 -15
  14. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  15. crawlee/browsers/_types.py +1 -1
  16. crawlee/configuration.py +2 -0
  17. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +6 -2
  18. crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
  19. crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
  20. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +39 -15
  21. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
  22. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +2 -1
  23. crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
  24. crawlee/crawlers/_basic/_basic_crawler.py +124 -37
  25. crawlee/crawlers/_playwright/_playwright_crawler.py +17 -5
  26. crawlee/events/_event_manager.py +3 -1
  27. crawlee/events/_types.py +6 -6
  28. crawlee/fingerprint_suite/_header_generator.py +2 -2
  29. crawlee/fingerprint_suite/_types.py +2 -2
  30. crawlee/otel/crawler_instrumentor.py +3 -3
  31. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  32. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  33. crawlee/request_loaders/_request_list.py +1 -1
  34. crawlee/request_loaders/_sitemap_request_loader.py +23 -5
  35. crawlee/sessions/_models.py +2 -2
  36. crawlee/sessions/_session_pool.py +1 -1
  37. crawlee/statistics/_error_snapshotter.py +1 -1
  38. crawlee/statistics/_models.py +33 -2
  39. crawlee/statistics/_statistics.py +24 -33
  40. crawlee/storage_clients/__init__.py +16 -0
  41. crawlee/storage_clients/_base/_storage_client.py +13 -0
  42. crawlee/storage_clients/_file_system/_dataset_client.py +29 -27
  43. crawlee/storage_clients/_file_system/_key_value_store_client.py +29 -25
  44. crawlee/storage_clients/_file_system/_request_queue_client.py +53 -34
  45. crawlee/storage_clients/_file_system/_storage_client.py +16 -3
  46. crawlee/storage_clients/_file_system/_utils.py +0 -0
  47. crawlee/storage_clients/_memory/_dataset_client.py +16 -4
  48. crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
  49. crawlee/storage_clients/_memory/_request_queue_client.py +16 -4
  50. crawlee/storage_clients/_memory/_storage_client.py +6 -3
  51. crawlee/storage_clients/_redis/__init__.py +6 -0
  52. crawlee/storage_clients/_redis/_client_mixin.py +295 -0
  53. crawlee/storage_clients/_redis/_dataset_client.py +325 -0
  54. crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
  55. crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
  56. crawlee/storage_clients/_redis/_storage_client.py +146 -0
  57. crawlee/storage_clients/_redis/_utils.py +23 -0
  58. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  59. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  60. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  61. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  62. crawlee/storage_clients/_redis/py.typed +0 -0
  63. crawlee/storage_clients/_sql/__init__.py +6 -0
  64. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  65. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  66. crawlee/storage_clients/_sql/_db_models.py +268 -0
  67. crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
  68. crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
  69. crawlee/storage_clients/_sql/_storage_client.py +291 -0
  70. crawlee/storage_clients/_sql/py.typed +0 -0
  71. crawlee/storage_clients/models.py +10 -10
  72. crawlee/storages/_base.py +5 -1
  73. crawlee/storages/_dataset.py +12 -2
  74. crawlee/storages/_key_value_store.py +17 -4
  75. crawlee/storages/_request_queue.py +10 -2
  76. crawlee/storages/_storage_instance_manager.py +133 -71
  77. crawlee/storages/_utils.py +11 -0
  78. {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/METADATA +17 -6
  79. {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/RECORD +82 -59
  80. {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/WHEEL +0 -0
  81. {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/entry_points.txt +0 -0
  82. {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/licenses/LICENSE +0 -0
crawlee/crawlers/_basic/_basic_crawler.py CHANGED
@@ -27,6 +27,7 @@ from crawlee import EnqueueStrategy, Glob, RequestTransformAction, service_locat
 from crawlee._autoscaling import AutoscaledPool, Snapshotter, SystemStatus
 from crawlee._log_config import configure_logger, get_configured_log_level, string_to_log_level
 from crawlee._request import Request, RequestOptions, RequestState
+from crawlee._service_locator import ServiceLocator
 from crawlee._types import (
     BasicCrawlingContext,
     EnqueueLinksKwargs,
@@ -204,7 +205,7 @@ class _BasicCrawlerOptions(TypedDict):
     Returning `None` suppresses the status message."""


-class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], TypedDict):
+class _BasicCrawlerOptionsGeneric(TypedDict, Generic[TCrawlingContext, TStatisticsState]):
     """Generic options the `BasicCrawler` constructor."""

     request_handler: NotRequired[Callable[[TCrawlingContext], Awaitable[None]]]
@@ -219,9 +220,9 @@ class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], T


 class BasicCrawlerOptions(
-    Generic[TCrawlingContext, TStatisticsState],
     _BasicCrawlerOptions,
     _BasicCrawlerOptionsGeneric[TCrawlingContext, TStatisticsState],
+    Generic[TCrawlingContext, TStatisticsState],
 ):
     """Arguments for the `BasicCrawler` constructor.

@@ -346,14 +347,23 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             _logger: A logger instance, typically provided by a subclass, for consistent logging labels.
                 Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
         """
-        if configuration:
-            service_locator.set_configuration(configuration)
-        if storage_client:
-            service_locator.set_storage_client(storage_client)
-        if event_manager:
-            service_locator.set_event_manager(event_manager)
+        implicit_event_manager_with_explicit_config = False
+        if not configuration:
+            configuration = service_locator.get_configuration()
+        elif not event_manager:
+            implicit_event_manager_with_explicit_config = True

-        config = service_locator.get_configuration()
+        if not storage_client:
+            storage_client = service_locator.get_storage_client()
+
+        if not event_manager:
+            event_manager = service_locator.get_event_manager()
+
+        self._service_locator = ServiceLocator(
+            configuration=configuration, storage_client=storage_client, event_manager=event_manager
+        )
+
+        config = self._service_locator.get_configuration()

         # Core components
         self._request_manager = request_manager
@@ -419,17 +429,31 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         httpx_logger = logging.getLogger('httpx')  # Silence HTTPX logger
         httpx_logger.setLevel(logging.DEBUG if get_configured_log_level() <= logging.DEBUG else logging.WARNING)
         self._logger = _logger or logging.getLogger(__name__)
+        if implicit_event_manager_with_explicit_config:
+            self._logger.warning(
+                'No event manager set, implicitly using event manager from global service_locator.'
+                'It is advised to explicitly set the event manager if explicit configuration is used as well.'
+            )
         self._statistics_log_format = statistics_log_format

         # Statistics
-        self._statistics = statistics or cast(
-            'Statistics[TStatisticsState]',
-            Statistics.with_default_state(
-                periodic_message_logger=self._logger,
-                statistics_log_format=self._statistics_log_format,
-                log_message='Current request statistics:',
-            ),
-        )
+        if statistics:
+            self._statistics = statistics
+        else:
+
+            async def persist_state_factory() -> KeyValueStore:
+                return await self.get_key_value_store()
+
+            self._statistics = cast(
+                'Statistics[TStatisticsState]',
+                Statistics.with_default_state(
+                    persistence_enabled=True,
+                    periodic_message_logger=self._logger,
+                    statistics_log_format=self._statistics_log_format,
+                    log_message='Current request statistics:',
+                    persist_state_kvs_factory=persist_state_factory,
+                ),
+            )

         # Additional context managers to enter and exit
         self._additional_context_managers = _additional_context_managers or []
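For orientation, a minimal usage sketch of what the constructor change above enables (class and module names are taken from the hunk; the exact fallback behaviour is the one described by the added warning): services passed to the crawler now live in a crawler-owned ServiceLocator instead of being written into the global service_locator.

from crawlee.configuration import Configuration
from crawlee.crawlers import BasicCrawler
from crawlee.storage_clients import MemoryStorageClient

# Sketch only: configuration and storage_client provided here are kept on the
# crawler's own ServiceLocator; the global service_locator is consulted merely
# as a fallback for anything not passed explicitly.
crawler = BasicCrawler(
    configuration=Configuration(),
    storage_client=MemoryStorageClient(),
)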
@@ -548,7 +572,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
     async def get_request_manager(self) -> RequestManager:
         """Return the configured request manager. If none is configured, open and return the default request queue."""
         if not self._request_manager:
-            self._request_manager = await RequestQueue.open()
+            self._request_manager = await RequestQueue.open(
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )

         return self._request_manager

@@ -557,18 +584,32 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
     ) -> Dataset:
         """Return the `Dataset` with the given ID or name. If none is provided, return the default one."""
-        return await Dataset.open(id=id, name=name)
+        return await Dataset.open(
+            id=id,
+            name=name,
+            alias=alias,
+            storage_client=self._service_locator.get_storage_client(),
+            configuration=self._service_locator.get_configuration(),
+        )

     async def get_key_value_store(
         self,
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
     ) -> KeyValueStore:
         """Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS."""
-        return await KeyValueStore.open(id=id, name=name)
+        return await KeyValueStore.open(
+            id=id,
+            name=name,
+            alias=alias,
+            storage_client=self._service_locator.get_storage_client(),
+            configuration=self._service_locator.get_configuration(),
+        )

     def error_handler(
         self, handler: ErrorHandler[TCrawlingContext | BasicCrawlingContext]
@@ -627,7 +668,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         request_manager = await self.get_request_manager()
         if purge_request_queue and isinstance(request_manager, RequestQueue):
             await request_manager.drop()
-            self._request_manager = await RequestQueue.open()
+            self._request_manager = await RequestQueue.open(
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )

         if requests is not None:
             await self.add_requests(requests)
@@ -654,7 +698,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         except CancelledError:
             pass
         finally:
-            await self._crawler_state_rec_task.stop()
             if threading.current_thread() is threading.main_thread():
                 with suppress(NotImplementedError):
                     asyncio.get_running_loop().remove_signal_handler(signal.SIGINT)
@@ -684,9 +727,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         return final_statistics

     async def _run_crawler(self) -> None:
-        event_manager = service_locator.get_event_manager()
-
-        self._crawler_state_rec_task.start()
+        event_manager = self._service_locator.get_event_manager()

         # Collect the context managers to be entered. Context managers that are already active are excluded,
         # as they were likely entered by the caller, who will also be responsible for exiting them.
@@ -698,6 +739,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                 self._statistics,
                 self._session_pool if self._use_session_pool else None,
                 self._http_client,
+                self._crawler_state_rec_task,
                 *self._additional_context_managers,
             )
             if cm and getattr(cm, 'active', False) is False
@@ -772,6 +814,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self,
         dataset_id: str | None = None,
         dataset_name: str | None = None,
+        dataset_alias: str | None = None,
         **kwargs: Unpack[GetDataKwargs],
     ) -> DatasetItemsListPage:
         """Retrieve data from a `Dataset`.
@@ -781,13 +824,20 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

         Args:
             dataset_id: The ID of the `Dataset`.
-            dataset_name: The name of the `Dataset`.
+            dataset_name: The name of the `Dataset` (global scope, named storage).
+            dataset_alias: The alias of the `Dataset` (run scope, unnamed storage).
             kwargs: Keyword arguments to be passed to the `Dataset.get_data()` method.

         Returns:
             The retrieved data.
         """
-        dataset = await Dataset.open(id=dataset_id, name=dataset_name)
+        dataset = await Dataset.open(
+            id=dataset_id,
+            name=dataset_name,
+            alias=dataset_alias,
+            storage_client=self._service_locator.get_storage_client(),
+            configuration=self._service_locator.get_configuration(),
+        )
         return await dataset.get_data(**kwargs)

     async def export_data(
@@ -795,6 +845,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         path: str | Path,
         dataset_id: str | None = None,
         dataset_name: str | None = None,
+        dataset_alias: str | None = None,
     ) -> None:
         """Export all items from a Dataset to a JSON or CSV file.

@@ -804,10 +855,17 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

         Args:
             path: The destination file path. Must end with '.json' or '.csv'.
-            dataset_id: The ID of the Dataset to export from. If None, uses `name` parameter instead.
-            dataset_name: The name of the Dataset to export from. If None, uses `id` parameter instead.
+            dataset_id: The ID of the Dataset to export from.
+            dataset_name: The name of the Dataset to export from (global scope, named storage).
+            dataset_alias: The alias of the Dataset to export from (run scope, unnamed storage).
         """
-        dataset = await self.get_dataset(id=dataset_id, name=dataset_name)
+        dataset = await Dataset.open(
+            id=dataset_id,
+            name=dataset_name,
+            alias=dataset_alias,
+            storage_client=self._service_locator.get_storage_client(),
+            configuration=self._service_locator.get_configuration(),
+        )

         path = path if isinstance(path, Path) else Path(path)
         dst = path.open('w', newline='')
@@ -824,6 +882,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         data: list[dict[str, Any]] | dict[str, Any],
         dataset_id: str | None = None,
         dataset_name: str | None = None,
+        dataset_alias: str | None = None,
         **kwargs: Unpack[PushDataKwargs],
     ) -> None:
         """Push data to a `Dataset`.
@@ -834,10 +893,11 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         Args:
             data: The data to push to the `Dataset`.
             dataset_id: The ID of the `Dataset`.
-            dataset_name: The name of the `Dataset`.
+            dataset_name: The name of the `Dataset` (global scope, named storage).
+            dataset_alias: The alias of the `Dataset` (run scope, unnamed storage).
             kwargs: Keyword arguments to be passed to the `Dataset.push_data()` method.
         """
-        dataset = await self.get_dataset(id=dataset_id, name=dataset_name)
+        dataset = await self.get_dataset(id=dataset_id, name=dataset_name, alias=dataset_alias)
         await dataset.push_data(data, **kwargs)

     def _should_retry_request(self, context: BasicCrawlingContext, error: Exception) -> bool:
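A hedged sketch of the new dataset_alias parameter shown above (the URL and the 'snapshot' alias are illustrative; per the updated docstrings, named storages are global and persisted, while aliases address unnamed, run-scoped storages):

import asyncio

from crawlee.crawlers import BasicCrawler, BasicCrawlingContext


async def main() -> None:
    crawler = BasicCrawler()

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        await context.push_data({'url': context.request.url})

    await crawler.run(['https://crawlee.dev'])

    # Copy the default dataset into an alias-addressed (unnamed, run-scoped)
    # dataset and export it, exercising the new dataset_alias keyword.
    items = await crawler.get_data()
    await crawler.push_data(items.items, dataset_alias='snapshot')
    await crawler.export_data('items.json', dataset_alias='snapshot')


asyncio.run(main())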
@@ -894,6 +954,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
            transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
            | None = None,
            requests: Sequence[str | Request] | None = None,
+           rq_id: str | None = None,
+           rq_name: str | None = None,
+           rq_alias: str | None = None,
            **kwargs: Unpack[EnqueueLinksKwargs],
        ) -> None:
            kwargs.setdefault('strategy', 'same-hostname')
@@ -905,7 +968,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                        '`transform_request_function` arguments when `requests` is provided.'
                    )
                # Add directly passed requests.
-               await context.add_requests(requests or list[str | Request](), **kwargs)
+               await context.add_requests(
+                   requests or list[str | Request](), rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, **kwargs
+               )
            else:
                # Add requests from extracted links.
                await context.add_requests(
@@ -914,7 +979,11 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                        label=label,
                        user_data=user_data,
                        transform_request_function=transform_request_function,
+                       **kwargs,
                    ),
+                   rq_id=rq_id,
+                   rq_name=rq_name,
+                   rq_alias=rq_alias,
                    **kwargs,
                )

@@ -1191,10 +1260,28 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
         result = self._context_result_map[context]

-        request_manager = await self.get_request_manager()
+        base_request_manager = await self.get_request_manager()
+
         origin = context.request.loaded_url or context.request.url

         for add_requests_call in result.add_requests_calls:
+            rq_id = add_requests_call.get('rq_id')
+            rq_name = add_requests_call.get('rq_name')
+            rq_alias = add_requests_call.get('rq_alias')
+            specified_params = sum(1 for param in [rq_id, rq_name, rq_alias] if param is not None)
+            if specified_params > 1:
+                raise ValueError('You can only provide one of `rq_id`, `rq_name` or `rq_alias` arguments.')
+            if rq_id or rq_name or rq_alias:
+                request_manager: RequestManager | RequestQueue = await RequestQueue.open(
+                    id=rq_id,
+                    name=rq_name,
+                    alias=rq_alias,
+                    storage_client=self._service_locator.get_storage_client(),
+                    configuration=self._service_locator.get_configuration(),
+                )
+            else:
+                request_manager = base_request_manager
+
             requests = list[Request]()

             base_url = url if (url := add_requests_call.get('base_url')) else origin
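To illustrate the rq_id / rq_name / rq_alias routing added above, a hedged sketch of a handler that sends explicitly passed requests to a separate, alias-addressed request queue (URL and alias are illustrative; per the check in the hunk, at most one of the three parameters may be given per call):

import asyncio

from crawlee.crawlers import BasicCrawler, BasicCrawlingContext


async def main() -> None:
    crawler = BasicCrawler()

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        # Requests added with rq_alias bypass the crawler's default request
        # queue and land in a queue opened on demand, as the
        # _commit_request_handler_result hunk above shows.
        await context.add_requests(['https://crawlee.dev/docs'], rq_alias='follow-up')

    await crawler.run(['https://crawlee.dev'])


asyncio.run(main())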
@@ -1226,8 +1313,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         result: RequestHandlerRunResult, get_kvs: GetKeyValueStoreFromRequestHandlerFunction
     ) -> None:
         """Store key value store changes recorded in result."""
-        for (id, name), changes in result.key_value_store_changes.items():
-            store = await get_kvs(id=id, name=name)
+        for (id, name, alias), changes in result.key_value_store_changes.items():
+            store = await get_kvs(id=id, name=name, alias=alias)
             for key, value in changes.updates.items():
                 await store.set_value(key, value.content, value.content_type)

@@ -1520,7 +1607,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

     async def _crawler_state_task(self) -> None:
         """Emit a persist state event with the given migration status."""
-        event_manager = service_locator.get_event_manager()
+        event_manager = self._service_locator.get_event_manager()

         current_state = self.statistics.state

crawlee/crawlers/_playwright/_playwright_crawler.py CHANGED
@@ -12,6 +12,7 @@ from typing_extensions import NotRequired, TypedDict, TypeVar

 from crawlee import service_locator
 from crawlee._request import Request, RequestOptions
+from crawlee._types import ConcurrencySettings
 from crawlee._utils.blocked import RETRY_CSS_SELECTORS
 from crawlee._utils.docs import docs_group
 from crawlee._utils.robots import RobotsTxtFile
@@ -113,7 +114,10 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             browser_pool: A `BrowserPool` instance to be used for launching the browsers and getting pages.
             user_data_dir: Path to a user data directory, which stores browser session data like cookies
                 and local storage.
-            browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
+            browser_type: The type of browser to launch:
+                - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+                - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+                    the system.
                 This option should not be used if `browser_pool` is provided.
             browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
                 directly to Playwright's `browser_type.launch` method. For more details, refer to the
@@ -152,7 +156,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             ):
                 raise ValueError(
                     'You cannot provide `headless`, `browser_type`, `browser_launch_options`, '
-                    '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir` or'
+                    '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir` or '
                     '`fingerprint_generator` arguments when `browser_pool` is provided.'
                 )

@@ -194,6 +198,10 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]

         kwargs['http_client'] = PlaywrightHttpClient() if not kwargs.get('http_client') else kwargs['http_client']

+        # Set default concurrency settings for browser crawlers if not provided
+        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
         super().__init__(**kwargs)

     async def _open_page(
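Putting the two changes above together, a hedged construction sketch (assumes Playwright is installed and, for the 'chrome' option, that Google Chrome is present on the machine):

from crawlee import ConcurrencySettings
from crawlee.crawlers import PlaywrightCrawler

# 'chrome' launches the locally installed Google Chrome instead of the
# Playwright-managed Chromium build. The explicit ConcurrencySettings override
# the new browser-crawler default of desired_concurrency=1.
crawler = PlaywrightCrawler(
    browser_type='chrome',
    concurrency_settings=ConcurrencySettings(desired_concurrency=4),
)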
@@ -361,7 +369,9 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
            links_iterator: Iterator[str] = iter(
                [url for element in elements if (url := await element.get_attribute('href')) is not None]
            )
-           links_iterator = to_absolute_url_iterator(context.request.loaded_url or context.request.url, links_iterator)
+           links_iterator = to_absolute_url_iterator(
+               context.request.loaded_url or context.request.url, links_iterator, logger=context.log
+           )

            if robots_txt_file:
                skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
@@ -489,7 +499,9 @@ class _PlaywrightCrawlerAdditionalOptions(TypedDict):
     """A `BrowserPool` instance to be used for launching the browsers and getting pages."""

     browser_type: NotRequired[BrowserType]
-    """The type of browser to launch ('chromium', 'firefox', or 'webkit').
+    """The type of browser to launch:
+    - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+    - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on the system.
     This option should not be used if `browser_pool` is provided."""

     browser_launch_options: NotRequired[Mapping[str, Any]]
@@ -509,9 +521,9 @@ class _PlaywrightCrawlerAdditionalOptions(TypedDict):


 class PlaywrightCrawlerOptions(
-    Generic[TCrawlingContext, TStatisticsState],
     _PlaywrightCrawlerAdditionalOptions,
     BasicCrawlerOptions[TCrawlingContext, StatisticsState],
+    Generic[TCrawlingContext, TStatisticsState],
 ):
     """Arguments for the `AbstractHttpCrawler` constructor.

crawlee/events/_event_manager.py CHANGED
@@ -130,11 +130,13 @@ class EventManager:
         if not self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is not active.')

+        # Stop persist state event periodic emission and manually emit last one to ensure latest state is saved.
+        await self._emit_persist_state_event_rec_task.stop()
+        await self._emit_persist_state_event()
         await self.wait_for_all_listeners_to_complete(timeout=self._close_timeout)
         self._event_emitter.remove_all_listeners()
         self._listener_tasks.clear()
         self._listeners_to_wrappers.clear()
-        await self._emit_persist_state_event_rec_task.stop()
         self._active = False

     @overload
crawlee/events/_types.py CHANGED
@@ -40,7 +40,7 @@ class Event(str, Enum):
 class EventPersistStateData(BaseModel):
     """Data for the persist state event."""

-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     is_migrating: Annotated[bool, Field(alias='isMigrating')]

@@ -49,7 +49,7 @@ class EventPersistStateData(BaseModel):
 class EventSystemInfoData(BaseModel):
     """Data for the system info event."""

-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     cpu_info: Annotated[CpuInfo, Field(alias='cpuInfo')]
     memory_info: Annotated[
@@ -62,7 +62,7 @@ class EventSystemInfoData(BaseModel):
 class EventMigratingData(BaseModel):
     """Data for the migrating event."""

-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     # The remaining time in seconds before the migration is forced and the process is killed
     # Optional because it's not present when the event handler is called manually
@@ -73,21 +73,21 @@ class EventMigratingData(BaseModel):
 class EventAbortingData(BaseModel):
     """Data for the aborting event."""

-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)


 @docs_group('Event data')
 class EventExitData(BaseModel):
     """Data for the exit event."""

-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)


 @docs_group('Event data')
 class EventCrawlerStatusData(BaseModel):
     """Data for the crawler status event."""

-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     message: str
     """A message describing the current status of the crawler."""
crawlee/fingerprint_suite/_header_generator.py CHANGED
@@ -11,9 +11,9 @@ if TYPE_CHECKING:


 def fingerprint_browser_type_from_playwright_browser_type(
-    playwright_browser_type: Literal['chromium', 'firefox', 'webkit'],
+    playwright_browser_type: Literal['chromium', 'firefox', 'webkit', 'chrome'],
 ) -> SupportedBrowserType:
-    if playwright_browser_type == 'chromium':
+    if playwright_browser_type in {'chromium', 'chrome'}:
         return 'chrome'
     if playwright_browser_type == 'firefox':
         return 'firefox'
crawlee/fingerprint_suite/_types.py CHANGED
@@ -11,7 +11,7 @@ SupportedBrowserType = Literal['chrome', 'firefox', 'safari', 'edge']


 class ScreenOptions(BaseModel):
-    model_config = ConfigDict(extra='forbid', populate_by_name=True)
+    model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True)

     """Defines the screen constrains for the fingerprint generator."""

@@ -31,7 +31,7 @@ class ScreenOptions(BaseModel):
 class HeaderGeneratorOptions(BaseModel):
     """Collection of header related attributes that can be used by the fingerprint generator."""

-    model_config = ConfigDict(extra='forbid', populate_by_name=True)
+    model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True)

     browsers: list[SupportedBrowserType] | None = None
     """List of BrowserSpecifications to generate the headers for."""
crawlee/otel/crawler_instrumentor.py CHANGED
@@ -69,7 +69,7 @@ class CrawlerInstrumentor(BaseInstrumentor):

         if request_handling_instrumentation:

-            async def middlware_wrapper(wrapped: Any, instance: _Middleware, args: Any, kwargs: Any) -> Any:
+            async def middleware_wrapper(wrapped: Any, instance: _Middleware, args: Any, kwargs: Any) -> Any:
                 with self._tracer.start_as_current_span(
                     name=f'{instance.generator.__name__}, {wrapped.__name__}',  # type:ignore[attr-defined]  # valid in our context
                     attributes={
@@ -111,8 +111,8 @@
             # Handpicked interesting methods to instrument
             self._instrumented.extend(
                 [
-                    (_Middleware, 'action', middlware_wrapper),
-                    (_Middleware, 'cleanup', middlware_wrapper),
+                    (_Middleware, 'action', middleware_wrapper),
+                    (_Middleware, 'cleanup', middleware_wrapper),
                     (ContextPipeline, '__call__', context_pipeline_wrapper),
                     (BasicCrawler, '_BasicCrawler__run_task_function', self._simple_async_wrapper),
                     (BasicCrawler, '_commit_request_handler_result', _commit_request_handler_result_wrapper),
crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml CHANGED
@@ -5,8 +5,8 @@
 # % endif
 # % if cookiecutter.http_client == 'curl-impersonate'
 # % do extras.append('curl-impersonate')
-# % elif cookiecutter.http_client == 'impit'
-# % do extras.append('impit')
+# % elif cookiecutter.http_client == 'httpx'
+# % do extras.append('httpx')
 # % endif

 [project]
crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt CHANGED
@@ -10,4 +10,7 @@ apify
 # % if cookiecutter.http_client == 'curl-impersonate'
 # % do extras.append('curl-impersonate')
 # % endif
+# % if cookiecutter.http_client == 'httpx'
+# % do extras.append('httpx')
+# % endif
 crawlee[{{ extras | join(',') }}]
crawlee/request_loaders/_request_list.py CHANGED
@@ -17,7 +17,7 @@ logger = getLogger(__name__)


 class RequestListState(BaseModel):
-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     next_index: Annotated[int, Field(alias='nextIndex')] = 0
     next_unique_key: Annotated[str | None, Field(alias='nextUniqueKey')] = None
crawlee/request_loaders/_sitemap_request_loader.py CHANGED
@@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Annotated, Any
 from pydantic import BaseModel, ConfigDict, Field
 from typing_extensions import override

-from crawlee import Request
+from crawlee import Request, RequestOptions
 from crawlee._utils.docs import docs_group
 from crawlee._utils.globs import Glob
 from crawlee._utils.recoverable_state import RecoverableState
@@ -18,9 +18,10 @@ from crawlee.request_loaders._request_loader import RequestLoader

 if TYPE_CHECKING:
     import re
-    from collections.abc import Sequence
+    from collections.abc import Callable, Sequence
     from types import TracebackType

+    from crawlee import RequestTransformAction
     from crawlee.http_clients import HttpClient
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.storage_clients.models import ProcessedRequest
@@ -56,7 +57,7 @@ class SitemapRequestLoaderState(BaseModel):
     `in_progress` is cleared.
     """

-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     url_queue: Annotated[deque[str], Field(alias='urlQueue')]
     """Queue of URLs extracted from sitemaps and ready for processing."""
@@ -90,6 +91,11 @@ class SitemapRequestLoaderState(BaseModel):
 class SitemapRequestLoader(RequestLoader):
     """A request loader that reads URLs from sitemap(s).

+    The loader is designed to handle sitemaps that follow the format described in the Sitemaps protocol
+    (https://www.sitemaps.org/protocol.html). It supports both XML and plain text sitemap formats.
+    Note that HTML pages containing links are not supported - those should be handled by regular crawlers
+    and the `enqueue_links` functionality.
+
     The loader fetches and parses sitemaps in the background, allowing crawling to start
     before all URLs are loaded. It supports filtering URLs using glob and regex patterns.

@@ -107,6 +113,7 @@
         exclude: list[re.Pattern[Any] | Glob] | None = None,
         max_buffer_size: int = 200,
         persist_state_key: str | None = None,
+        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
     ) -> None:
         """Initialize the sitemap request loader.

@@ -120,6 +127,9 @@
             persist_state_key: A key for persisting the loader's state in the KeyValueStore.
                 When provided, allows resuming from where it left off after interruption.
                 If None, no state persistence occurs.
+            transform_request_function: An optional function to transform requests
+                generated by the loader. It receives `RequestOptions` with `url` and should return either
+                modified `RequestOptions` or a `RequestTransformAction`.
         """
         self._http_client = http_client
         self._sitemap_urls = sitemap_urls
@@ -127,6 +137,7 @@
         self._exclude = exclude
         self._proxy_info = proxy_info
         self._max_buffer_size = max_buffer_size
+        self._transform_request_function = transform_request_function

         # Synchronization for queue operations
         self._queue_has_capacity = asyncio.Event()
@@ -308,8 +319,15 @@

             async with self._queue_lock:
                 url = state.url_queue.popleft()
-
-                request = Request.from_url(url)
+                request_option = RequestOptions(url=url)
+                if self._transform_request_function:
+                    transform_request_option = self._transform_request_function(request_option)
+                    if transform_request_option == 'skip':
+                        state.total_count -= 1
+                        continue
+                    if transform_request_option != 'unchanged':
+                        request_option = transform_request_option
+                request = Request.from_url(**request_option)
                 state.in_progress.add(request.url)
                 if len(state.url_queue) < self._max_buffer_size:
                     self._queue_has_capacity.set()
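A hedged usage sketch of the new transform_request_function hook (sitemap URL and label are illustrative; ImpitHttpClient stands in for any crawlee HTTP client):

import asyncio

from crawlee import RequestOptions, RequestTransformAction
from crawlee.http_clients import ImpitHttpClient
from crawlee.request_loaders import SitemapRequestLoader


def transform(options: RequestOptions) -> RequestOptions | RequestTransformAction:
    # Drop PDF links entirely and label everything else for a dedicated handler.
    if options['url'].endswith('.pdf'):
        return 'skip'
    options['label'] = 'FROM_SITEMAP'
    return options


async def main() -> None:
    loader = SitemapRequestLoader(
        sitemap_urls=['https://crawlee.dev/sitemap.xml'],
        http_client=ImpitHttpClient(),
        transform_request_function=transform,
    )
    # Each URL popped from the sitemap queue passes through `transform` before
    # it is turned into a Request, as the last hunk above shows.
    request = await loader.fetch_next_request()
    print(request.url if request else 'no URL loaded yet')


asyncio.run(main())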