crawlee 1.0.0rc1__py3-none-any.whl → 1.0.1b1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- crawlee/_autoscaling/snapshotter.py +1 -1
- crawlee/_request.py +2 -1
- crawlee/_service_locator.py +44 -24
- crawlee/_types.py +76 -17
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/sitemap.py +3 -1
- crawlee/_utils/system.py +3 -3
- crawlee/configuration.py +1 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +3 -1
- crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
- crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +6 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +2 -1
- crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
- crawlee/crawlers/_basic/_basic_crawler.py +107 -27
- crawlee/crawlers/_basic/_logging_utils.py +5 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +6 -1
- crawlee/events/_types.py +6 -6
- crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
- crawlee/fingerprint_suite/_types.py +2 -2
- crawlee/request_loaders/_request_list.py +1 -1
- crawlee/request_loaders/_request_loader.py +5 -1
- crawlee/request_loaders/_sitemap_request_loader.py +228 -48
- crawlee/sessions/_models.py +2 -2
- crawlee/statistics/_models.py +1 -1
- crawlee/storage_clients/__init__.py +12 -0
- crawlee/storage_clients/_base/_storage_client.py +13 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +27 -25
- crawlee/storage_clients/_file_system/_key_value_store_client.py +27 -23
- crawlee/storage_clients/_file_system/_request_queue_client.py +84 -98
- crawlee/storage_clients/_file_system/_storage_client.py +16 -3
- crawlee/storage_clients/_file_system/_utils.py +0 -0
- crawlee/storage_clients/_memory/_dataset_client.py +14 -2
- crawlee/storage_clients/_memory/_key_value_store_client.py +14 -2
- crawlee/storage_clients/_memory/_request_queue_client.py +43 -12
- crawlee/storage_clients/_memory/_storage_client.py +6 -3
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +269 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +299 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +706 -0
- crawlee/storage_clients/_sql/_storage_client.py +282 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +10 -10
- crawlee/storages/_base.py +3 -1
- crawlee/storages/_dataset.py +9 -2
- crawlee/storages/_key_value_store.py +9 -2
- crawlee/storages/_request_queue.py +7 -2
- crawlee/storages/_storage_instance_manager.py +126 -72
- {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1b1.dist-info}/METADATA +12 -5
- {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1b1.dist-info}/RECORD +56 -46
- {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1b1.dist-info}/WHEEL +0 -0
- {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1b1.dist-info}/entry_points.txt +0 -0
- {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1b1.dist-info}/licenses/LICENSE +0 -0
crawlee/crawlers/_basic/_basic_crawler.py
CHANGED
@@ -27,6 +27,7 @@ from crawlee import EnqueueStrategy, Glob, RequestTransformAction, service_locat
 from crawlee._autoscaling import AutoscaledPool, Snapshotter, SystemStatus
 from crawlee._log_config import configure_logger, get_configured_log_level, string_to_log_level
 from crawlee._request import Request, RequestOptions, RequestState
+from crawlee._service_locator import ServiceLocator
 from crawlee._types import (
     BasicCrawlingContext,
     EnqueueLinksKwargs,
@@ -204,7 +205,7 @@ class _BasicCrawlerOptions(TypedDict):
     Returning `None` suppresses the status message."""


-class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState]
+class _BasicCrawlerOptionsGeneric(TypedDict, Generic[TCrawlingContext, TStatisticsState]):
     """Generic options the `BasicCrawler` constructor."""

     request_handler: NotRequired[Callable[[TCrawlingContext], Awaitable[None]]]
@@ -219,9 +220,9 @@ class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], T


 class BasicCrawlerOptions(
-    Generic[TCrawlingContext, TStatisticsState],
     _BasicCrawlerOptions,
     _BasicCrawlerOptionsGeneric[TCrawlingContext, TStatisticsState],
+    Generic[TCrawlingContext, TStatisticsState],
 ):
     """Arguments for the `BasicCrawler` constructor.

@@ -346,14 +347,23 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             _logger: A logger instance, typically provided by a subclass, for consistent logging labels.
                 Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
         """
-
-
-
-
-
-
+        implicit_event_manager_with_explicit_config = False
+        if not configuration:
+            configuration = service_locator.get_configuration()
+        elif not event_manager:
+            implicit_event_manager_with_explicit_config = True
+
+        if not storage_client:
+            storage_client = service_locator.get_storage_client()
+
+        if not event_manager:
+            event_manager = service_locator.get_event_manager()
+
+        self._service_locator = ServiceLocator(
+            configuration=configuration, storage_client=storage_client, event_manager=event_manager
+        )

-        config =
+        config = self._service_locator.get_configuration()

         # Core components
         self._request_manager = request_manager
@@ -419,6 +429,11 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         httpx_logger = logging.getLogger('httpx')  # Silence HTTPX logger
         httpx_logger.setLevel(logging.DEBUG if get_configured_log_level() <= logging.DEBUG else logging.WARNING)
         self._logger = _logger or logging.getLogger(__name__)
+        if implicit_event_manager_with_explicit_config:
+            self._logger.warning(
+                'No event manager set, implicitly using event manager from global service_locator.'
+                'It is advised to explicitly set the event manager if explicit configuration is used as well.'
+            )
         self._statistics_log_format = statistics_log_format

         # Statistics
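
Note: the two hunks above show that `BasicCrawler` now resolves any service not passed to its constructor from the global `service_locator`, wraps the result in a per-crawler `ServiceLocator`, and warns when an explicit configuration is combined with an implicitly resolved event manager. The following is an illustrative sketch only (not code from the package); it assumes the public `Configuration`, `MemoryStorageClient`, and `LocalEventManager` classes and shows all three services passed explicitly, so the new warning does not apply:

    import asyncio

    from crawlee.configuration import Configuration
    from crawlee.crawlers import BasicCrawler
    from crawlee.events import LocalEventManager
    from crawlee.storage_clients import MemoryStorageClient


    async def main() -> None:
        # All three services are passed explicitly, so the crawler's internal
        # ServiceLocator never has to fall back to the global one.
        crawler = BasicCrawler(
            configuration=Configuration(),
            storage_client=MemoryStorageClient(),
            event_manager=LocalEventManager(),
        )

        @crawler.router.default_handler
        async def handler(context) -> None:
            context.log.info(f'Processing {context.request.url}')

        await crawler.run(['https://example.com'])


    asyncio.run(main())
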
@@ -548,7 +563,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
     async def get_request_manager(self) -> RequestManager:
         """Return the configured request manager. If none is configured, open and return the default request queue."""
         if not self._request_manager:
-            self._request_manager = await RequestQueue.open(
+            self._request_manager = await RequestQueue.open(
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )

         return self._request_manager

@@ -557,18 +575,32 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
     ) -> Dataset:
         """Return the `Dataset` with the given ID or name. If none is provided, return the default one."""
-        return await Dataset.open(
+        return await Dataset.open(
+            id=id,
+            name=name,
+            alias=alias,
+            storage_client=self._service_locator.get_storage_client(),
+            configuration=self._service_locator.get_configuration(),
+        )

     async def get_key_value_store(
         self,
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
     ) -> KeyValueStore:
         """Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS."""
-        return await KeyValueStore.open(
+        return await KeyValueStore.open(
+            id=id,
+            name=name,
+            alias=alias,
+            storage_client=self._service_locator.get_storage_client(),
+            configuration=self._service_locator.get_configuration(),
+        )

     def error_handler(
         self, handler: ErrorHandler[TCrawlingContext | BasicCrawlingContext]
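
Note: with the new `alias` parameter shown above, `name` keeps addressing a globally named, persistent storage, while `alias` addresses an unnamed storage scoped to the current run. An illustrative sketch with hypothetical names, assuming a `crawler` instance as above:

    # A named dataset is shared across runs; an aliased one is run-scoped.
    named_dataset = await crawler.get_dataset(name='products')
    scratch_dataset = await crawler.get_dataset(alias='scratch')

    kvs = await crawler.get_key_value_store(alias='run-cache')
    await kvs.set_value('started_at', '2025-01-01T00:00:00Z')
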
@@ -627,7 +659,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         request_manager = await self.get_request_manager()
         if purge_request_queue and isinstance(request_manager, RequestQueue):
             await request_manager.drop()
-            self._request_manager = await RequestQueue.open(
+            self._request_manager = await RequestQueue.open(
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )

         if requests is not None:
             await self.add_requests(requests)
@@ -684,7 +719,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         return final_statistics

     async def _run_crawler(self) -> None:
-        event_manager =
+        event_manager = self._service_locator.get_event_manager()

         self._crawler_state_rec_task.start()

@@ -772,6 +807,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self,
         dataset_id: str | None = None,
         dataset_name: str | None = None,
+        dataset_alias: str | None = None,
         **kwargs: Unpack[GetDataKwargs],
     ) -> DatasetItemsListPage:
         """Retrieve data from a `Dataset`.
@@ -781,13 +817,20 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

         Args:
             dataset_id: The ID of the `Dataset`.
-            dataset_name: The name of the `Dataset
+            dataset_name: The name of the `Dataset` (global scope, named storage).
+            dataset_alias: The alias of the `Dataset` (run scope, unnamed storage).
             kwargs: Keyword arguments to be passed to the `Dataset.get_data()` method.

         Returns:
             The retrieved data.
         """
-        dataset = await Dataset.open(
+        dataset = await Dataset.open(
+            id=dataset_id,
+            name=dataset_name,
+            alias=dataset_alias,
+            storage_client=self._service_locator.get_storage_client(),
+            configuration=self._service_locator.get_configuration(),
+        )
         return await dataset.get_data(**kwargs)

     async def export_data(
@@ -795,6 +838,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         path: str | Path,
         dataset_id: str | None = None,
         dataset_name: str | None = None,
+        dataset_alias: str | None = None,
     ) -> None:
         """Export all items from a Dataset to a JSON or CSV file.

@@ -804,10 +848,17 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

         Args:
             path: The destination file path. Must end with '.json' or '.csv'.
-            dataset_id: The ID of the Dataset to export from.
-            dataset_name: The name of the Dataset to export from
+            dataset_id: The ID of the Dataset to export from.
+            dataset_name: The name of the Dataset to export from (global scope, named storage).
+            dataset_alias: The alias of the Dataset to export from (run scope, unnamed storage).
         """
-        dataset = await
+        dataset = await Dataset.open(
+            id=dataset_id,
+            name=dataset_name,
+            alias=dataset_alias,
+            storage_client=self._service_locator.get_storage_client(),
+            configuration=self._service_locator.get_configuration(),
+        )

         path = path if isinstance(path, Path) else Path(path)
         dst = path.open('w', newline='')
@@ -824,6 +875,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         data: list[dict[str, Any]] | dict[str, Any],
         dataset_id: str | None = None,
         dataset_name: str | None = None,
+        dataset_alias: str | None = None,
         **kwargs: Unpack[PushDataKwargs],
     ) -> None:
         """Push data to a `Dataset`.
@@ -834,10 +886,11 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         Args:
             data: The data to push to the `Dataset`.
             dataset_id: The ID of the `Dataset`.
-            dataset_name: The name of the `Dataset
+            dataset_name: The name of the `Dataset` (global scope, named storage).
+            dataset_alias: The alias of the `Dataset` (run scope, unnamed storage).
             kwargs: Keyword arguments to be passed to the `Dataset.push_data()` method.
         """
-        dataset = await self.get_dataset(id=dataset_id, name=dataset_name)
+        dataset = await self.get_dataset(id=dataset_id, name=dataset_name, alias=dataset_alias)
         await dataset.push_data(data, **kwargs)

     def _should_retry_request(self, context: BasicCrawlingContext, error: Exception) -> bool:
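
Note: `push_data` and `export_data` accept the new `dataset_alias` as well. An illustrative sketch with hypothetical names, routing results into a run-scoped dataset and exporting it to CSV afterwards:

    await crawler.push_data({'url': 'https://example.com', 'title': 'Example'}, dataset_alias='page-titles')
    await crawler.export_data('page_titles.csv', dataset_alias='page-titles')
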
@@ -894,6 +947,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
         | None = None,
         requests: Sequence[str | Request] | None = None,
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> None:
         kwargs.setdefault('strategy', 'same-hostname')
@@ -905,7 +961,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                     '`transform_request_function` arguments when `requests` is provided.'
                 )
             # Add directly passed requests.
-            await context.add_requests(
+            await context.add_requests(
+                requests or list[str | Request](), rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, **kwargs
+            )
         else:
             # Add requests from extracted links.
             await context.add_requests(
@@ -915,6 +973,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                     user_data=user_data,
                     transform_request_function=transform_request_function,
                 ),
+                rq_id=rq_id,
+                rq_name=rq_name,
+                rq_alias=rq_alias,
                 **kwargs,
             )

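
Note: with the `rq_id` / `rq_name` / `rq_alias` parameters added above, extracted links and directly passed requests can be sent to a request queue other than the crawler's default one; as the later hunk on `_commit_request_handler_result` shows, at most one of the three may be given per call. An illustrative handler sketch (hypothetical selector and alias, assuming a crawler with link extraction such as `ParselCrawler`):

    async def handler(context) -> None:
        # Send matching links to a separate, run-scoped request queue.
        await context.enqueue_links(selector='a.product', rq_alias='product-pages')
        # Directly passed requests can be routed the same way.
        await context.add_requests(['https://example.com/page/2'], rq_alias='product-pages')
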
@@ -1031,8 +1092,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

         if self._should_retry_request(context, error):
             request.retry_count += 1
+            reduced_error = str(error).split('\n')[0]
             self.log.warning(
-                f'Retrying request to {context.request.url} due to: {
+                f'Retrying request to {context.request.url} due to: {reduced_error}'
                 f'{get_one_line_error_summary_if_possible(error)}'
             )
             await self._statistics.error_tracker.add(error=error, context=context)
@@ -1190,10 +1252,28 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
         result = self._context_result_map[context]

-
+        base_request_manager = await self.get_request_manager()
+
         origin = context.request.loaded_url or context.request.url

         for add_requests_call in result.add_requests_calls:
+            rq_id = add_requests_call.get('rq_id')
+            rq_name = add_requests_call.get('rq_name')
+            rq_alias = add_requests_call.get('rq_alias')
+            specified_params = sum(1 for param in [rq_id, rq_name, rq_alias] if param is not None)
+            if specified_params > 1:
+                raise ValueError('You can only provide one of `rq_id`, `rq_name` or `rq_alias` arguments.')
+            if rq_id or rq_name or rq_alias:
+                request_manager: RequestManager | RequestQueue = await RequestQueue.open(
+                    id=rq_id,
+                    name=rq_name,
+                    alias=rq_alias,
+                    storage_client=self._service_locator.get_storage_client(),
+                    configuration=self._service_locator.get_configuration(),
+                )
+            else:
+                request_manager = base_request_manager
+
             requests = list[Request]()

             base_url = url if (url := add_requests_call.get('base_url')) else origin
@@ -1225,8 +1305,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         result: RequestHandlerRunResult, get_kvs: GetKeyValueStoreFromRequestHandlerFunction
     ) -> None:
         """Store key value store changes recorded in result."""
-        for (id, name), changes in result.key_value_store_changes.items():
-            store = await get_kvs(id=id, name=name)
+        for (id, name, alias), changes in result.key_value_store_changes.items():
+            store = await get_kvs(id=id, name=name, alias=alias)
             for key, value in changes.updates.items():
                 await store.set_value(key, value.content, value.content_type)

@@ -1519,7 +1599,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

     async def _crawler_state_task(self) -> None:
         """Emit a persist state event with the given migration status."""
-        event_manager =
+        event_manager = self._service_locator.get_event_manager()

         current_state = self.statistics.state

crawlee/crawlers/_basic/_logging_utils.py
CHANGED
@@ -49,7 +49,11 @@ def _get_traceback_parts_for_innermost_exception(error: Exception) -> list[str]:

 def get_one_line_error_summary_if_possible(error: Exception) -> str:
     if isinstance(error, asyncio.exceptions.TimeoutError):
-        most_relevant_part = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)[-1]
+        most_relevant_part = ',' + reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)[-1]
+    elif 'playwright._impl._errors.Error' in str(error.__class__):
+        # Playwright autogenerated errors are often very long, so we do not try to summarize them at all as they anyway
+        # point to deep internals.
+        return ''
     else:
         traceback_parts = _get_traceback_parts_for_innermost_exception(error)
         # Commonly last traceback part is type of the error, and the second last part is the relevant file.
crawlee/crawlers/_playwright/_playwright_crawler.py
CHANGED
@@ -12,6 +12,7 @@ from typing_extensions import NotRequired, TypedDict, TypeVar

 from crawlee import service_locator
 from crawlee._request import Request, RequestOptions
+from crawlee._types import ConcurrencySettings
 from crawlee._utils.blocked import RETRY_CSS_SELECTORS
 from crawlee._utils.docs import docs_group
 from crawlee._utils.robots import RobotsTxtFile
@@ -194,6 +195,10 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]

         kwargs['http_client'] = PlaywrightHttpClient() if not kwargs.get('http_client') else kwargs['http_client']

+        # Set default concurrency settings for browser crawlers if not provided
+        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
         super().__init__(**kwargs)

     async def _open_page(
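
Note: `PlaywrightCrawler` now defaults to `ConcurrencySettings(desired_concurrency=1)`, which is friendlier to browser-based crawling; explicitly passed settings still take precedence. An illustrative sketch (values are arbitrary; `ConcurrencySettings` is assumed to be importable from the top-level `crawlee` package):

    from crawlee import ConcurrencySettings
    from crawlee.crawlers import PlaywrightCrawler

    crawler = PlaywrightCrawler(
        # Explicit settings override the new default of desired_concurrency=1.
        concurrency_settings=ConcurrencySettings(desired_concurrency=4, max_concurrency=8),
    )
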
@@ -509,9 +514,9 @@ class _PlaywrightCrawlerAdditionalOptions(TypedDict):


 class PlaywrightCrawlerOptions(
-    Generic[TCrawlingContext, TStatisticsState],
     _PlaywrightCrawlerAdditionalOptions,
     BasicCrawlerOptions[TCrawlingContext, StatisticsState],
+    Generic[TCrawlingContext, TStatisticsState],
 ):
     """Arguments for the `AbstractHttpCrawler` constructor.

crawlee/events/_types.py
CHANGED
@@ -40,7 +40,7 @@ class Event(str, Enum):
 class EventPersistStateData(BaseModel):
     """Data for the persist state event."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     is_migrating: Annotated[bool, Field(alias='isMigrating')]

@@ -49,7 +49,7 @@ class EventPersistStateData(BaseModel):
 class EventSystemInfoData(BaseModel):
     """Data for the system info event."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     cpu_info: Annotated[CpuInfo, Field(alias='cpuInfo')]
     memory_info: Annotated[
@@ -62,7 +62,7 @@ class EventSystemInfoData(BaseModel):
 class EventMigratingData(BaseModel):
     """Data for the migrating event."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     # The remaining time in seconds before the migration is forced and the process is killed
     # Optional because it's not present when the event handler is called manually
@@ -73,21 +73,21 @@ class EventMigratingData(BaseModel):
 class EventAbortingData(BaseModel):
     """Data for the aborting event."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)


 @docs_group('Event data')
 class EventExitData(BaseModel):
     """Data for the exit event."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)


 @docs_group('Event data')
 class EventCrawlerStatusData(BaseModel):
     """Data for the crawler status event."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     message: str
     """A message describing the current status of the crawler."""
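
Note: every event-data model now sets `validate_by_name=True` and `validate_by_alias=True` in its `ConfigDict` (Pydantic 2.11+ options), so payloads validate whether they use the Python field name or the camelCase alias. A standalone Pydantic sketch of the effect (not crawlee code):

    from pydantic import BaseModel, ConfigDict, Field

    class ExampleEventData(BaseModel):
        model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

        is_migrating: bool = Field(alias='isMigrating')

    ExampleEventData(isMigrating=True)   # accepted via the alias
    ExampleEventData(is_migrating=True)  # accepted via the field name
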
crawlee/fingerprint_suite/_fingerprint_generator.py
CHANGED
@@ -3,10 +3,13 @@ from __future__ import annotations
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING

+from crawlee._utils.docs import docs_group
+
 if TYPE_CHECKING:
     from browserforge.fingerprints import Fingerprint


+@docs_group('Other')
 class FingerprintGenerator(ABC):
     """A class for creating browser fingerprints that mimic browser fingerprints of real users."""

crawlee/fingerprint_suite/_types.py
CHANGED
@@ -11,7 +11,7 @@ SupportedBrowserType = Literal['chrome', 'firefox', 'safari', 'edge']


 class ScreenOptions(BaseModel):
-    model_config = ConfigDict(extra='forbid',
+    model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True)

     """Defines the screen constrains for the fingerprint generator."""

@@ -31,7 +31,7 @@ class ScreenOptions(BaseModel):
 class HeaderGeneratorOptions(BaseModel):
     """Collection of header related attributes that can be used by the fingerprint generator."""

-    model_config = ConfigDict(extra='forbid',
+    model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True)

     browsers: list[SupportedBrowserType] | None = None
     """List of BrowserSpecifications to generate the headers for."""
crawlee/request_loaders/_request_list.py
CHANGED
@@ -17,7 +17,7 @@ logger = getLogger(__name__)


 class RequestListState(BaseModel):
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     next_index: Annotated[int, Field(alias='nextIndex')] = 0
     next_unique_key: Annotated[str | None, Field(alias='nextUniqueKey')] = None
crawlee/request_loaders/_request_loader.py
CHANGED
@@ -43,7 +43,11 @@ class RequestLoader(ABC):

     @abstractmethod
     async def fetch_next_request(self) -> Request | None:
-        """Return the next request to be processed, or `
+        """Return the next request to be processed, or `None` if there are no more pending requests.
+
+        The method should return `None` if and only if `is_finished` would return `True`. In other cases, the method
+        should wait until a request appears.
+        """

     @abstractmethod
     async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None: