crawlee 1.0.0rc1__py3-none-any.whl → 1.0.1b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. crawlee/_autoscaling/snapshotter.py +1 -1
  2. crawlee/_request.py +2 -1
  3. crawlee/_service_locator.py +44 -24
  4. crawlee/_types.py +76 -17
  5. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  6. crawlee/_utils/sitemap.py +3 -1
  7. crawlee/_utils/system.py +3 -3
  8. crawlee/configuration.py +1 -1
  9. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +3 -1
  10. crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
  11. crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
  12. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +6 -2
  13. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
  14. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +2 -1
  15. crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
  16. crawlee/crawlers/_basic/_basic_crawler.py +107 -27
  17. crawlee/crawlers/_basic/_logging_utils.py +5 -1
  18. crawlee/crawlers/_playwright/_playwright_crawler.py +6 -1
  19. crawlee/events/_types.py +6 -6
  20. crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
  21. crawlee/fingerprint_suite/_types.py +2 -2
  22. crawlee/request_loaders/_request_list.py +1 -1
  23. crawlee/request_loaders/_request_loader.py +5 -1
  24. crawlee/request_loaders/_sitemap_request_loader.py +228 -48
  25. crawlee/sessions/_models.py +2 -2
  26. crawlee/statistics/_models.py +1 -1
  27. crawlee/storage_clients/__init__.py +12 -0
  28. crawlee/storage_clients/_base/_storage_client.py +13 -0
  29. crawlee/storage_clients/_file_system/_dataset_client.py +27 -25
  30. crawlee/storage_clients/_file_system/_key_value_store_client.py +27 -23
  31. crawlee/storage_clients/_file_system/_request_queue_client.py +84 -98
  32. crawlee/storage_clients/_file_system/_storage_client.py +16 -3
  33. crawlee/storage_clients/_file_system/_utils.py +0 -0
  34. crawlee/storage_clients/_memory/_dataset_client.py +14 -2
  35. crawlee/storage_clients/_memory/_key_value_store_client.py +14 -2
  36. crawlee/storage_clients/_memory/_request_queue_client.py +43 -12
  37. crawlee/storage_clients/_memory/_storage_client.py +6 -3
  38. crawlee/storage_clients/_sql/__init__.py +6 -0
  39. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  40. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  41. crawlee/storage_clients/_sql/_db_models.py +269 -0
  42. crawlee/storage_clients/_sql/_key_value_store_client.py +299 -0
  43. crawlee/storage_clients/_sql/_request_queue_client.py +706 -0
  44. crawlee/storage_clients/_sql/_storage_client.py +282 -0
  45. crawlee/storage_clients/_sql/py.typed +0 -0
  46. crawlee/storage_clients/models.py +10 -10
  47. crawlee/storages/_base.py +3 -1
  48. crawlee/storages/_dataset.py +9 -2
  49. crawlee/storages/_key_value_store.py +9 -2
  50. crawlee/storages/_request_queue.py +7 -2
  51. crawlee/storages/_storage_instance_manager.py +126 -72
  52. {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1b1.dist-info}/METADATA +12 -5
  53. {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1b1.dist-info}/RECORD +56 -46
  54. {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1b1.dist-info}/WHEEL +0 -0
  55. {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1b1.dist-info}/entry_points.txt +0 -0
  56. {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1b1.dist-info}/licenses/LICENSE +0 -0
crawlee/crawlers/_basic/_basic_crawler.py CHANGED
@@ -27,6 +27,7 @@ from crawlee import EnqueueStrategy, Glob, RequestTransformAction, service_locat
  from crawlee._autoscaling import AutoscaledPool, Snapshotter, SystemStatus
  from crawlee._log_config import configure_logger, get_configured_log_level, string_to_log_level
  from crawlee._request import Request, RequestOptions, RequestState
+ from crawlee._service_locator import ServiceLocator
  from crawlee._types import (
  BasicCrawlingContext,
  EnqueueLinksKwargs,
@@ -204,7 +205,7 @@ class _BasicCrawlerOptions(TypedDict):
  Returning `None` suppresses the status message."""


- class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], TypedDict):
+ class _BasicCrawlerOptionsGeneric(TypedDict, Generic[TCrawlingContext, TStatisticsState]):
  """Generic options the `BasicCrawler` constructor."""

  request_handler: NotRequired[Callable[[TCrawlingContext], Awaitable[None]]]
@@ -219,9 +220,9 @@ class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], T


  class BasicCrawlerOptions(
- Generic[TCrawlingContext, TStatisticsState],
  _BasicCrawlerOptions,
  _BasicCrawlerOptionsGeneric[TCrawlingContext, TStatisticsState],
+ Generic[TCrawlingContext, TStatisticsState],
  ):
  """Arguments for the `BasicCrawler` constructor.

@@ -346,14 +347,23 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  _logger: A logger instance, typically provided by a subclass, for consistent logging labels.
  Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
  """
- if configuration:
- service_locator.set_configuration(configuration)
- if storage_client:
- service_locator.set_storage_client(storage_client)
- if event_manager:
- service_locator.set_event_manager(event_manager)
+ implicit_event_manager_with_explicit_config = False
+ if not configuration:
+ configuration = service_locator.get_configuration()
+ elif not event_manager:
+ implicit_event_manager_with_explicit_config = True
+
+ if not storage_client:
+ storage_client = service_locator.get_storage_client()
+
+ if not event_manager:
+ event_manager = service_locator.get_event_manager()
+
+ self._service_locator = ServiceLocator(
+ configuration=configuration, storage_client=storage_client, event_manager=event_manager
+ )

- config = service_locator.get_configuration()
+ config = self._service_locator.get_configuration()

  # Core components
  self._request_manager = request_manager
@@ -419,6 +429,11 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  httpx_logger = logging.getLogger('httpx') # Silence HTTPX logger
  httpx_logger.setLevel(logging.DEBUG if get_configured_log_level() <= logging.DEBUG else logging.WARNING)
  self._logger = _logger or logging.getLogger(__name__)
+ if implicit_event_manager_with_explicit_config:
+ self._logger.warning(
+ 'No event manager set, implicitly using event manager from global service_locator.'
+ 'It is advised to explicitly set the event manager if explicit configuration is used as well.'
+ )
  self._statistics_log_format = statistics_log_format

  # Statistics
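The hunks above replace the old behavior of writing explicitly passed services into the global service_locator with a per-crawler ServiceLocator snapshot. A minimal sketch of the user-facing side, assuming the import locations shown (the configuration/storage_client/event_manager keyword arguments themselves come from the diff):

```python
import asyncio

from crawlee.configuration import Configuration
from crawlee.crawlers import BasicCrawler, BasicCrawlingContext
from crawlee.events import LocalEventManager
from crawlee.storage_clients import MemoryStorageClient


async def main() -> None:
    # Explicitly provided services are now kept on the crawler instance
    # instead of being pushed into the global service_locator.
    crawler = BasicCrawler(
        configuration=Configuration(),
        storage_client=MemoryStorageClient(),
        # Passing the event manager too avoids the new warning about implicitly
        # falling back to the global service_locator's event manager.
        event_manager=LocalEventManager(),
    )

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```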
@@ -548,7 +563,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  async def get_request_manager(self) -> RequestManager:
  """Return the configured request manager. If none is configured, open and return the default request queue."""
  if not self._request_manager:
- self._request_manager = await RequestQueue.open()
+ self._request_manager = await RequestQueue.open(
+ storage_client=self._service_locator.get_storage_client(),
+ configuration=self._service_locator.get_configuration(),
+ )

  return self._request_manager

@@ -557,18 +575,32 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  *,
  id: str | None = None,
  name: str | None = None,
+ alias: str | None = None,
  ) -> Dataset:
  """Return the `Dataset` with the given ID or name. If none is provided, return the default one."""
- return await Dataset.open(id=id, name=name)
+ return await Dataset.open(
+ id=id,
+ name=name,
+ alias=alias,
+ storage_client=self._service_locator.get_storage_client(),
+ configuration=self._service_locator.get_configuration(),
+ )

  async def get_key_value_store(
  self,
  *,
  id: str | None = None,
  name: str | None = None,
+ alias: str | None = None,
  ) -> KeyValueStore:
  """Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS."""
- return await KeyValueStore.open(id=id, name=name)
+ return await KeyValueStore.open(
+ id=id,
+ name=name,
+ alias=alias,
+ storage_client=self._service_locator.get_storage_client(),
+ configuration=self._service_locator.get_configuration(),
+ )

  def error_handler(
  self, handler: ErrorHandler[TCrawlingContext | BasicCrawlingContext]
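With the new `alias` parameter, the crawler helpers can open run-scoped, unnamed storages alongside globally named ones. A usage sketch (the helper signatures come from the hunk above; `get_data` and `get_value` are existing storage methods):

```python
from crawlee.crawlers import BasicCrawler


async def inspect_storages(crawler: BasicCrawler) -> None:
    # `alias` opens a run-scoped, unnamed storage; `name` opens a globally named one.
    products = await crawler.get_dataset(alias='scraped-products')
    settings = await crawler.get_key_value_store(name='shared-config')

    print(await products.get_data())
    print(await settings.get_value('api_base_url'))
```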
@@ -627,7 +659,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  request_manager = await self.get_request_manager()
  if purge_request_queue and isinstance(request_manager, RequestQueue):
  await request_manager.drop()
- self._request_manager = await RequestQueue.open()
+ self._request_manager = await RequestQueue.open(
+ storage_client=self._service_locator.get_storage_client(),
+ configuration=self._service_locator.get_configuration(),
+ )

  if requests is not None:
  await self.add_requests(requests)
@@ -684,7 +719,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  return final_statistics

  async def _run_crawler(self) -> None:
- event_manager = service_locator.get_event_manager()
+ event_manager = self._service_locator.get_event_manager()

  self._crawler_state_rec_task.start()

@@ -772,6 +807,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  self,
  dataset_id: str | None = None,
  dataset_name: str | None = None,
+ dataset_alias: str | None = None,
  **kwargs: Unpack[GetDataKwargs],
  ) -> DatasetItemsListPage:
  """Retrieve data from a `Dataset`.
@@ -781,13 +817,20 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

  Args:
  dataset_id: The ID of the `Dataset`.
- dataset_name: The name of the `Dataset`.
+ dataset_name: The name of the `Dataset` (global scope, named storage).
+ dataset_alias: The alias of the `Dataset` (run scope, unnamed storage).
  kwargs: Keyword arguments to be passed to the `Dataset.get_data()` method.

  Returns:
  The retrieved data.
  """
- dataset = await Dataset.open(id=dataset_id, name=dataset_name)
+ dataset = await Dataset.open(
+ id=dataset_id,
+ name=dataset_name,
+ alias=dataset_alias,
+ storage_client=self._service_locator.get_storage_client(),
+ configuration=self._service_locator.get_configuration(),
+ )
  return await dataset.get_data(**kwargs)

  async def export_data(
@@ -795,6 +838,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  path: str | Path,
  dataset_id: str | None = None,
  dataset_name: str | None = None,
+ dataset_alias: str | None = None,
  ) -> None:
  """Export all items from a Dataset to a JSON or CSV file.

@@ -804,10 +848,17 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

  Args:
  path: The destination file path. Must end with '.json' or '.csv'.
- dataset_id: The ID of the Dataset to export from. If None, uses `name` parameter instead.
- dataset_name: The name of the Dataset to export from. If None, uses `id` parameter instead.
+ dataset_id: The ID of the Dataset to export from.
+ dataset_name: The name of the Dataset to export from (global scope, named storage).
+ dataset_alias: The alias of the Dataset to export from (run scope, unnamed storage).
  """
- dataset = await self.get_dataset(id=dataset_id, name=dataset_name)
+ dataset = await Dataset.open(
+ id=dataset_id,
+ name=dataset_name,
+ alias=dataset_alias,
+ storage_client=self._service_locator.get_storage_client(),
+ configuration=self._service_locator.get_configuration(),
+ )

  path = path if isinstance(path, Path) else Path(path)
  dst = path.open('w', newline='')
@@ -824,6 +875,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  data: list[dict[str, Any]] | dict[str, Any],
  dataset_id: str | None = None,
  dataset_name: str | None = None,
+ dataset_alias: str | None = None,
  **kwargs: Unpack[PushDataKwargs],
  ) -> None:
  """Push data to a `Dataset`.
@@ -834,10 +886,11 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  Args:
  data: The data to push to the `Dataset`.
  dataset_id: The ID of the `Dataset`.
- dataset_name: The name of the `Dataset`.
+ dataset_name: The name of the `Dataset` (global scope, named storage).
+ dataset_alias: The alias of the `Dataset` (run scope, unnamed storage).
  kwargs: Keyword arguments to be passed to the `Dataset.push_data()` method.
  """
- dataset = await self.get_dataset(id=dataset_id, name=dataset_name)
+ dataset = await self.get_dataset(id=dataset_id, name=dataset_name, alias=dataset_alias)
  await dataset.push_data(data, **kwargs)

  def _should_retry_request(self, context: BasicCrawlingContext, error: Exception) -> bool:
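The same `dataset_alias` option threads through the data helpers, so pushing and exporting can target a run-scoped dataset without registering a global name. A sketch based on the signatures above:

```python
from crawlee.crawlers import BasicCrawler


async def save_results(crawler: BasicCrawler) -> None:
    # Push items into a run-scoped dataset identified only by an alias...
    await crawler.push_data(
        [{'url': 'https://example.com', 'status': 200}],
        dataset_alias='responses',
    )
    # ...then export that same dataset; the file suffix selects JSON or CSV.
    await crawler.export_data('responses.json', dataset_alias='responses')
```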
@@ -894,6 +947,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
  | None = None,
  requests: Sequence[str | Request] | None = None,
+ rq_id: str | None = None,
+ rq_name: str | None = None,
+ rq_alias: str | None = None,
  **kwargs: Unpack[EnqueueLinksKwargs],
  ) -> None:
  kwargs.setdefault('strategy', 'same-hostname')
@@ -905,7 +961,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  '`transform_request_function` arguments when `requests` is provided.'
  )
  # Add directly passed requests.
- await context.add_requests(requests or list[str | Request](), **kwargs)
+ await context.add_requests(
+ requests or list[str | Request](), rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, **kwargs
+ )
  else:
  # Add requests from extracted links.
  await context.add_requests(
@@ -915,6 +973,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  user_data=user_data,
  transform_request_function=transform_request_function,
  ),
+ rq_id=rq_id,
+ rq_name=rq_name,
+ rq_alias=rq_alias,
  **kwargs,
  )
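The new rq_id/rq_name/rq_alias keyword arguments let a request handler route new requests into a request queue other than the crawler's default one; only one of the three may be given (see the validation added further below). A sketch assuming the arguments are forwarded through `context.add_requests` as this hunk indicates:

```python
from crawlee import Request
from crawlee.crawlers import BasicCrawler, BasicCrawlingContext

crawler = BasicCrawler()


@crawler.router.default_handler
async def handler(context: BasicCrawlingContext) -> None:
    # Send discovered detail pages to a separate, run-scoped request queue
    # instead of the crawler's default one.
    await context.add_requests(
        [Request.from_url('https://example.com/detail/1')],
        rq_alias='detail-pages',
    )
```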

@@ -1031,8 +1092,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

  if self._should_retry_request(context, error):
  request.retry_count += 1
+ reduced_error = str(error).split('\n')[0]
  self.log.warning(
- f'Retrying request to {context.request.url} due to: {error} \n'
+ f'Retrying request to {context.request.url} due to: {reduced_error}'
  f'{get_one_line_error_summary_if_possible(error)}'
  )
  await self._statistics.error_tracker.add(error=error, context=context)
@@ -1190,10 +1252,28 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
  result = self._context_result_map[context]

- request_manager = await self.get_request_manager()
+ base_request_manager = await self.get_request_manager()
+
  origin = context.request.loaded_url or context.request.url

  for add_requests_call in result.add_requests_calls:
+ rq_id = add_requests_call.get('rq_id')
+ rq_name = add_requests_call.get('rq_name')
+ rq_alias = add_requests_call.get('rq_alias')
+ specified_params = sum(1 for param in [rq_id, rq_name, rq_alias] if param is not None)
+ if specified_params > 1:
+ raise ValueError('You can only provide one of `rq_id`, `rq_name` or `rq_alias` arguments.')
+ if rq_id or rq_name or rq_alias:
+ request_manager: RequestManager | RequestQueue = await RequestQueue.open(
+ id=rq_id,
+ name=rq_name,
+ alias=rq_alias,
+ storage_client=self._service_locator.get_storage_client(),
+ configuration=self._service_locator.get_configuration(),
+ )
+ else:
+ request_manager = base_request_manager
+
  requests = list[Request]()

  base_url = url if (url := add_requests_call.get('base_url')) else origin
@@ -1225,8 +1305,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  result: RequestHandlerRunResult, get_kvs: GetKeyValueStoreFromRequestHandlerFunction
  ) -> None:
  """Store key value store changes recorded in result."""
- for (id, name), changes in result.key_value_store_changes.items():
- store = await get_kvs(id=id, name=name)
+ for (id, name, alias), changes in result.key_value_store_changes.items():
+ store = await get_kvs(id=id, name=name, alias=alias)
  for key, value in changes.updates.items():
  await store.set_value(key, value.content, value.content_type)

@@ -1519,7 +1599,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

  async def _crawler_state_task(self) -> None:
  """Emit a persist state event with the given migration status."""
- event_manager = service_locator.get_event_manager()
+ event_manager = self._service_locator.get_event_manager()

  current_state = self.statistics.state

crawlee/crawlers/_basic/_logging_utils.py CHANGED
@@ -49,7 +49,11 @@ def _get_traceback_parts_for_innermost_exception(error: Exception) -> list[str]:

  def get_one_line_error_summary_if_possible(error: Exception) -> str:
  if isinstance(error, asyncio.exceptions.TimeoutError):
- most_relevant_part = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)[-1]
+ most_relevant_part = ',' + reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)[-1]
+ elif 'playwright._impl._errors.Error' in str(error.__class__):
+ # Playwright autogenerated errors are often very long, so we do not try to summarize them at all as they anyway
+ # point to deep internals.
+ return ''
  else:
  traceback_parts = _get_traceback_parts_for_innermost_exception(error)
  # Commonly last traceback part is type of the error, and the second last part is the relevant file.
crawlee/crawlers/_playwright/_playwright_crawler.py CHANGED
@@ -12,6 +12,7 @@ from typing_extensions import NotRequired, TypedDict, TypeVar

  from crawlee import service_locator
  from crawlee._request import Request, RequestOptions
+ from crawlee._types import ConcurrencySettings
  from crawlee._utils.blocked import RETRY_CSS_SELECTORS
  from crawlee._utils.docs import docs_group
  from crawlee._utils.robots import RobotsTxtFile
@@ -194,6 +195,10 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]

  kwargs['http_client'] = PlaywrightHttpClient() if not kwargs.get('http_client') else kwargs['http_client']

+ # Set default concurrency settings for browser crawlers if not provided
+ if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+ kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
  super().__init__(**kwargs)

  async def _open_page(
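Because browser-based crawlers now default to `ConcurrencySettings(desired_concurrency=1)` when nothing is passed, callers who want more parallel pages must opt in explicitly. A sketch, assuming `ConcurrencySettings` is importable from the top-level `crawlee` package:

```python
from crawlee import ConcurrencySettings
from crawlee.crawlers import PlaywrightCrawler

# Override the new single-page default for browser crawlers.
crawler = PlaywrightCrawler(
    concurrency_settings=ConcurrencySettings(desired_concurrency=4, max_concurrency=8),
)
```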
@@ -509,9 +514,9 @@ class _PlaywrightCrawlerAdditionalOptions(TypedDict):


  class PlaywrightCrawlerOptions(
- Generic[TCrawlingContext, TStatisticsState],
  _PlaywrightCrawlerAdditionalOptions,
  BasicCrawlerOptions[TCrawlingContext, StatisticsState],
+ Generic[TCrawlingContext, TStatisticsState],
  ):
  """Arguments for the `AbstractHttpCrawler` constructor.

crawlee/events/_types.py CHANGED
@@ -40,7 +40,7 @@ class Event(str, Enum):
  class EventPersistStateData(BaseModel):
  """Data for the persist state event."""

- model_config = ConfigDict(populate_by_name=True)
+ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

  is_migrating: Annotated[bool, Field(alias='isMigrating')]

@@ -49,7 +49,7 @@ class EventPersistStateData(BaseModel):
  class EventSystemInfoData(BaseModel):
  """Data for the system info event."""

- model_config = ConfigDict(populate_by_name=True)
+ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

  cpu_info: Annotated[CpuInfo, Field(alias='cpuInfo')]
  memory_info: Annotated[
@@ -62,7 +62,7 @@ class EventSystemInfoData(BaseModel):
  class EventMigratingData(BaseModel):
  """Data for the migrating event."""

- model_config = ConfigDict(populate_by_name=True)
+ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

  # The remaining time in seconds before the migration is forced and the process is killed
  # Optional because it's not present when the event handler is called manually
@@ -73,21 +73,21 @@ class EventMigratingData(BaseModel):
  class EventAbortingData(BaseModel):
  """Data for the aborting event."""

- model_config = ConfigDict(populate_by_name=True)
+ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)


  @docs_group('Event data')
  class EventExitData(BaseModel):
  """Data for the exit event."""

- model_config = ConfigDict(populate_by_name=True)
+ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)


  @docs_group('Event data')
  class EventCrawlerStatusData(BaseModel):
  """Data for the crawler status event."""

- model_config = ConfigDict(populate_by_name=True)
+ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

  message: str
  """A message describing the current status of the crawler."""
crawlee/fingerprint_suite/_fingerprint_generator.py CHANGED
@@ -3,10 +3,13 @@ from __future__ import annotations
  from abc import ABC, abstractmethod
  from typing import TYPE_CHECKING

+ from crawlee._utils.docs import docs_group
+
  if TYPE_CHECKING:
  from browserforge.fingerprints import Fingerprint


+ @docs_group('Other')
  class FingerprintGenerator(ABC):
  """A class for creating browser fingerprints that mimic browser fingerprints of real users."""

crawlee/fingerprint_suite/_types.py CHANGED
@@ -11,7 +11,7 @@ SupportedBrowserType = Literal['chrome', 'firefox', 'safari', 'edge']


  class ScreenOptions(BaseModel):
- model_config = ConfigDict(extra='forbid', populate_by_name=True)
+ model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True)

  """Defines the screen constrains for the fingerprint generator."""

@@ -31,7 +31,7 @@ class ScreenOptions(BaseModel):
  class HeaderGeneratorOptions(BaseModel):
  """Collection of header related attributes that can be used by the fingerprint generator."""

- model_config = ConfigDict(extra='forbid', populate_by_name=True)
+ model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True)

  browsers: list[SupportedBrowserType] | None = None
  """List of BrowserSpecifications to generate the headers for."""
crawlee/request_loaders/_request_list.py CHANGED
@@ -17,7 +17,7 @@ logger = getLogger(__name__)


  class RequestListState(BaseModel):
- model_config = ConfigDict(populate_by_name=True)
+ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

  next_index: Annotated[int, Field(alias='nextIndex')] = 0
  next_unique_key: Annotated[str | None, Field(alias='nextUniqueKey')] = None
crawlee/request_loaders/_request_loader.py CHANGED
@@ -43,7 +43,11 @@ class RequestLoader(ABC):

  @abstractmethod
  async def fetch_next_request(self) -> Request | None:
- """Return the next request to be processed, or `null` if there are no more pending requests."""
+ """Return the next request to be processed, or `None` if there are no more pending requests.
+
+ The method should return `None` if and only if `is_finished` would return `True`. In other cases, the method
+ should wait until a request appears.
+ """

  @abstractmethod
  async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
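The reworded docstring tightens the `fetch_next_request` contract: `None` means the loader is finished, and otherwise the call should wait until a request becomes available. A consumer-side sketch built on that contract (method names are taken from this class; treat it as illustrative rather than canonical):

```python
from crawlee.request_loaders import RequestLoader


async def drain(loader: RequestLoader) -> None:
    # `fetch_next_request` returns None only once `is_finished` is True;
    # while pending requests are still being produced, it waits instead.
    while not await loader.is_finished():
        request = await loader.fetch_next_request()
        if request is None:
            break  # The loader finished while we were waiting.
        ...  # Process the request here.
        await loader.mark_request_as_handled(request)
```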