crawlee 0.6.13b31__py3-none-any.whl → 1.1.1b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlee/_autoscaling/snapshotter.py +1 -1
- crawlee/_request.py +34 -22
- crawlee/_service_locator.py +44 -24
- crawlee/_types.py +86 -33
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +15 -0
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +1 -1
- crawlee/_utils/system.py +3 -3
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +4 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +2 -0
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +6 -2
- crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
- crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +39 -15
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +2 -1
- crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
- crawlee/crawlers/_basic/_basic_crawler.py +124 -37
- crawlee/crawlers/_playwright/_playwright_crawler.py +17 -5
- crawlee/events/_event_manager.py +3 -1
- crawlee/events/_types.py +6 -6
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/fingerprint_suite/_types.py +2 -2
- crawlee/otel/crawler_instrumentor.py +3 -3
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_request_list.py +1 -1
- crawlee/request_loaders/_sitemap_request_loader.py +23 -5
- crawlee/sessions/_models.py +2 -2
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +33 -2
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_storage_client.py +13 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +29 -27
- crawlee/storage_clients/_file_system/_key_value_store_client.py +29 -25
- crawlee/storage_clients/_file_system/_request_queue_client.py +53 -34
- crawlee/storage_clients/_file_system/_storage_client.py +16 -3
- crawlee/storage_clients/_file_system/_utils.py +0 -0
- crawlee/storage_clients/_memory/_dataset_client.py +16 -4
- crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
- crawlee/storage_clients/_memory/_request_queue_client.py +16 -4
- crawlee/storage_clients/_memory/_storage_client.py +6 -3
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +295 -0
- crawlee/storage_clients/_redis/_dataset_client.py +325 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
- crawlee/storage_clients/_redis/_storage_client.py +146 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +291 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +10 -10
- crawlee/storages/_base.py +5 -1
- crawlee/storages/_dataset.py +12 -2
- crawlee/storages/_key_value_store.py +17 -4
- crawlee/storages/_request_queue.py +10 -2
- crawlee/storages/_storage_instance_manager.py +133 -71
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/METADATA +17 -6
- {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/RECORD +82 -59
- {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/WHEEL +0 -0
- {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/licenses/LICENSE +0 -0
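The most visible additions in the file list are the new Redis- and SQL-backed storage clients (`crawlee/storage_clients/_redis/` and `crawlee/storage_clients/_sql/`). The sketch below shows the general pattern for plugging a non-default storage client into a crawler; it uses `MemoryStorageClient`, which already exists in `crawlee.storage_clients`, because the export names and constructor arguments of the new Redis/SQL clients are not shown in this diff and should be treated as unknowns until verified against the package.

import asyncio

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.storage_clients import MemoryStorageClient


async def main() -> None:
    # Any storage client implementation can be passed here; the new Redis/SQL
    # clients added in this release are expected to slot in the same way.
    crawler = HttpCrawler(storage_client=MemoryStorageClient())

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Visited {context.request.url}')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())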
crawlee/crawlers/_basic/_basic_crawler.py
CHANGED

@@ -27,6 +27,7 @@ from crawlee import EnqueueStrategy, Glob, RequestTransformAction, service_locator
 from crawlee._autoscaling import AutoscaledPool, Snapshotter, SystemStatus
 from crawlee._log_config import configure_logger, get_configured_log_level, string_to_log_level
 from crawlee._request import Request, RequestOptions, RequestState
+from crawlee._service_locator import ServiceLocator
 from crawlee._types import (
     BasicCrawlingContext,
     EnqueueLinksKwargs,
@@ -204,7 +205,7 @@ class _BasicCrawlerOptions(TypedDict):
     Returning `None` suppresses the status message."""


-class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], TypedDict):
+class _BasicCrawlerOptionsGeneric(TypedDict, Generic[TCrawlingContext, TStatisticsState]):
     """Generic options the `BasicCrawler` constructor."""

     request_handler: NotRequired[Callable[[TCrawlingContext], Awaitable[None]]]
@@ -219,9 +220,9 @@ class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], TypedDict):


 class BasicCrawlerOptions(
-    Generic[TCrawlingContext, TStatisticsState],
     _BasicCrawlerOptions,
     _BasicCrawlerOptionsGeneric[TCrawlingContext, TStatisticsState],
+    Generic[TCrawlingContext, TStatisticsState],
 ):
     """Arguments for the `BasicCrawler` constructor.

@@ -346,14 +347,23 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
            _logger: A logger instance, typically provided by a subclass, for consistent logging labels.
                Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
        """
-
-
-
-
-
-            service_locator.set_event_manager(event_manager)
+        implicit_event_manager_with_explicit_config = False
+        if not configuration:
+            configuration = service_locator.get_configuration()
+        elif not event_manager:
+            implicit_event_manager_with_explicit_config = True

-
+        if not storage_client:
+            storage_client = service_locator.get_storage_client()
+
+        if not event_manager:
+            event_manager = service_locator.get_event_manager()
+
+        self._service_locator = ServiceLocator(
+            configuration=configuration, storage_client=storage_client, event_manager=event_manager
+        )
+
+        config = self._service_locator.get_configuration()

        # Core components
        self._request_manager = request_manager
@@ -419,17 +429,31 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        httpx_logger = logging.getLogger('httpx')  # Silence HTTPX logger
        httpx_logger.setLevel(logging.DEBUG if get_configured_log_level() <= logging.DEBUG else logging.WARNING)
        self._logger = _logger or logging.getLogger(__name__)
+        if implicit_event_manager_with_explicit_config:
+            self._logger.warning(
+                'No event manager set, implicitly using event manager from global service_locator.'
+                'It is advised to explicitly set the event manager if explicit configuration is used as well.'
+            )
        self._statistics_log_format = statistics_log_format

        # Statistics
-
-
-
-
-
-
-
-
+        if statistics:
+            self._statistics = statistics
+        else:
+
+            async def persist_state_factory() -> KeyValueStore:
+                return await self.get_key_value_store()
+
+            self._statistics = cast(
+                'Statistics[TStatisticsState]',
+                Statistics.with_default_state(
+                    persistence_enabled=True,
+                    periodic_message_logger=self._logger,
+                    statistics_log_format=self._statistics_log_format,
+                    log_message='Current request statistics:',
+                    persist_state_kvs_factory=persist_state_factory,
+                ),
+            )

        # Additional context managers to enter and exit
        self._additional_context_managers = _additional_context_managers or []
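The constructor hunks above replace the old pattern of mutating the global `service_locator` with a crawler-local `ServiceLocator`, and add a warning when a configuration is passed explicitly while the event manager is still taken from the global locator. A minimal caller-side sketch, assuming the standard `Configuration` and `LocalEventManager` classes:

from crawlee.configuration import Configuration
from crawlee.crawlers import BasicCrawler
from crawlee.events import LocalEventManager

# Passing an explicit configuration together with an explicit event manager keeps all
# services crawler-local and avoids the new "implicitly using event manager from global
# service_locator" warning emitted by the code above.
crawler = BasicCrawler(
    configuration=Configuration(),
    event_manager=LocalEventManager(),
)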
@@ -548,7 +572,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
    async def get_request_manager(self) -> RequestManager:
        """Return the configured request manager. If none is configured, open and return the default request queue."""
        if not self._request_manager:
-            self._request_manager = await RequestQueue.open(
+            self._request_manager = await RequestQueue.open(
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )

        return self._request_manager

@@ -557,18 +584,32 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        *,
        id: str | None = None,
        name: str | None = None,
+        alias: str | None = None,
    ) -> Dataset:
        """Return the `Dataset` with the given ID or name. If none is provided, return the default one."""
-        return await Dataset.open(
+        return await Dataset.open(
+            id=id,
+            name=name,
+            alias=alias,
+            storage_client=self._service_locator.get_storage_client(),
+            configuration=self._service_locator.get_configuration(),
+        )

    async def get_key_value_store(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
+        alias: str | None = None,
    ) -> KeyValueStore:
        """Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS."""
-        return await KeyValueStore.open(
+        return await KeyValueStore.open(
+            id=id,
+            name=name,
+            alias=alias,
+            storage_client=self._service_locator.get_storage_client(),
+            configuration=self._service_locator.get_configuration(),
+        )

    def error_handler(
        self, handler: ErrorHandler[TCrawlingContext | BasicCrawlingContext]
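`get_dataset` and `get_key_value_store` gain an `alias` parameter next to `id` and `name`. Per the docstring changes later in this file, `name` refers to a global named storage while `alias` addresses an unnamed, run-scoped one. A small illustrative sketch (the names and alias values are made up):

from crawlee.crawlers import BasicCrawler


async def open_scoped_storages(crawler: BasicCrawler) -> None:
    # Named storage shared across runs vs. run-scoped storage addressed by alias.
    shared = await crawler.get_dataset(name='products')
    scratch = await crawler.get_key_value_store(alias='per-run-state')
    await shared.push_data({'example': True})
    await scratch.set_value('started', True)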
@@ -627,7 +668,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        request_manager = await self.get_request_manager()
        if purge_request_queue and isinstance(request_manager, RequestQueue):
            await request_manager.drop()
-            self._request_manager = await RequestQueue.open(
+            self._request_manager = await RequestQueue.open(
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )

        if requests is not None:
            await self.add_requests(requests)
@@ -654,7 +698,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        except CancelledError:
            pass
        finally:
-            await self._crawler_state_rec_task.stop()
            if threading.current_thread() is threading.main_thread():
                with suppress(NotImplementedError):
                    asyncio.get_running_loop().remove_signal_handler(signal.SIGINT)
@@ -684,9 +727,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        return final_statistics

    async def _run_crawler(self) -> None:
-        event_manager =
-
-        self._crawler_state_rec_task.start()
+        event_manager = self._service_locator.get_event_manager()

        # Collect the context managers to be entered. Context managers that are already active are excluded,
        # as they were likely entered by the caller, who will also be responsible for exiting them.
@@ -698,6 +739,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                self._statistics,
                self._session_pool if self._use_session_pool else None,
                self._http_client,
+                self._crawler_state_rec_task,
                *self._additional_context_managers,
            )
            if cm and getattr(cm, 'active', False) is False
@@ -772,6 +814,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        self,
        dataset_id: str | None = None,
        dataset_name: str | None = None,
+        dataset_alias: str | None = None,
        **kwargs: Unpack[GetDataKwargs],
    ) -> DatasetItemsListPage:
        """Retrieve data from a `Dataset`.
@@ -781,13 +824,20 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

        Args:
            dataset_id: The ID of the `Dataset`.
-            dataset_name: The name of the `Dataset
+            dataset_name: The name of the `Dataset` (global scope, named storage).
+            dataset_alias: The alias of the `Dataset` (run scope, unnamed storage).
            kwargs: Keyword arguments to be passed to the `Dataset.get_data()` method.

        Returns:
            The retrieved data.
        """
-        dataset = await Dataset.open(
+        dataset = await Dataset.open(
+            id=dataset_id,
+            name=dataset_name,
+            alias=dataset_alias,
+            storage_client=self._service_locator.get_storage_client(),
+            configuration=self._service_locator.get_configuration(),
+        )
        return await dataset.get_data(**kwargs)

    async def export_data(
@@ -795,6 +845,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        path: str | Path,
        dataset_id: str | None = None,
        dataset_name: str | None = None,
+        dataset_alias: str | None = None,
    ) -> None:
        """Export all items from a Dataset to a JSON or CSV file.

@@ -804,10 +855,17 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

        Args:
            path: The destination file path. Must end with '.json' or '.csv'.
-            dataset_id: The ID of the Dataset to export from.
-            dataset_name: The name of the Dataset to export from
+            dataset_id: The ID of the Dataset to export from.
+            dataset_name: The name of the Dataset to export from (global scope, named storage).
+            dataset_alias: The alias of the Dataset to export from (run scope, unnamed storage).
        """
-        dataset = await
+        dataset = await Dataset.open(
+            id=dataset_id,
+            name=dataset_name,
+            alias=dataset_alias,
+            storage_client=self._service_locator.get_storage_client(),
+            configuration=self._service_locator.get_configuration(),
+        )

        path = path if isinstance(path, Path) else Path(path)
        dst = path.open('w', newline='')
@@ -824,6 +882,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        data: list[dict[str, Any]] | dict[str, Any],
        dataset_id: str | None = None,
        dataset_name: str | None = None,
+        dataset_alias: str | None = None,
        **kwargs: Unpack[PushDataKwargs],
    ) -> None:
        """Push data to a `Dataset`.
@@ -834,10 +893,11 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        Args:
            data: The data to push to the `Dataset`.
            dataset_id: The ID of the `Dataset`.
-            dataset_name: The name of the `Dataset
+            dataset_name: The name of the `Dataset` (global scope, named storage).
+            dataset_alias: The alias of the `Dataset` (run scope, unnamed storage).
            kwargs: Keyword arguments to be passed to the `Dataset.push_data()` method.
        """
-        dataset = await self.get_dataset(id=dataset_id, name=dataset_name)
+        dataset = await self.get_dataset(id=dataset_id, name=dataset_name, alias=dataset_alias)
        await dataset.push_data(data, **kwargs)

    def _should_retry_request(self, context: BasicCrawlingContext, error: Exception) -> bool:
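`get_data`, `export_data`, and `push_data` grow a matching `dataset_alias` parameter. A hedged usage sketch (alias and file name are illustrative):

from crawlee.crawlers import BasicCrawler


async def collect_and_export(crawler: BasicCrawler) -> None:
    # Push to a run-scoped dataset addressed by alias, then export it to a file.
    await crawler.push_data({'url': 'https://crawlee.dev'}, dataset_alias='run-results')
    await crawler.export_data('results.json', dataset_alias='run-results')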
@@ -894,6 +954,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
            transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
            | None = None,
            requests: Sequence[str | Request] | None = None,
+            rq_id: str | None = None,
+            rq_name: str | None = None,
+            rq_alias: str | None = None,
            **kwargs: Unpack[EnqueueLinksKwargs],
        ) -> None:
            kwargs.setdefault('strategy', 'same-hostname')
@@ -905,7 +968,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                    '`transform_request_function` arguments when `requests` is provided.'
                )
                # Add directly passed requests.
-                await context.add_requests(
+                await context.add_requests(
+                    requests or list[str | Request](), rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, **kwargs
+                )
            else:
                # Add requests from extracted links.
                await context.add_requests(
@@ -914,7 +979,11 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                        label=label,
                        user_data=user_data,
                        transform_request_function=transform_request_function,
+                        **kwargs,
                    ),
+                    rq_id=rq_id,
+                    rq_name=rq_name,
+                    rq_alias=rq_alias,
                    **kwargs,
                )

@@ -1191,10 +1260,28 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
        result = self._context_result_map[context]

-
+        base_request_manager = await self.get_request_manager()
+
        origin = context.request.loaded_url or context.request.url

        for add_requests_call in result.add_requests_calls:
+            rq_id = add_requests_call.get('rq_id')
+            rq_name = add_requests_call.get('rq_name')
+            rq_alias = add_requests_call.get('rq_alias')
+            specified_params = sum(1 for param in [rq_id, rq_name, rq_alias] if param is not None)
+            if specified_params > 1:
+                raise ValueError('You can only provide one of `rq_id`, `rq_name` or `rq_alias` arguments.')
+            if rq_id or rq_name or rq_alias:
+                request_manager: RequestManager | RequestQueue = await RequestQueue.open(
+                    id=rq_id,
+                    name=rq_name,
+                    alias=rq_alias,
+                    storage_client=self._service_locator.get_storage_client(),
+                    configuration=self._service_locator.get_configuration(),
+                )
+            else:
+                request_manager = base_request_manager
+
            requests = list[Request]()

            base_url = url if (url := add_requests_call.get('base_url')) else origin
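The `rq_id` / `rq_name` / `rq_alias` arguments threaded through `enqueue_links`, `add_requests`, and `_commit_request_handler_result` let a handler route newly discovered requests into a request queue other than the crawler's default one, with a guard that only one of the three may be given. A sketch of how this would look from a request handler (the queue name is illustrative):

from crawlee.crawlers import BasicCrawler, BasicCrawlingContext

crawler = BasicCrawler()


@crawler.router.default_handler
async def handler(context: BasicCrawlingContext) -> None:
    # Route follow-up requests into a separate, named request queue instead of the
    # crawler's own; passing more than one of rq_id / rq_name / rq_alias raises ValueError.
    await context.add_requests(['https://crawlee.dev/docs'], rq_name='secondary-queue')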
@@ -1226,8 +1313,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        result: RequestHandlerRunResult, get_kvs: GetKeyValueStoreFromRequestHandlerFunction
    ) -> None:
        """Store key value store changes recorded in result."""
-        for (id, name), changes in result.key_value_store_changes.items():
-            store = await get_kvs(id=id, name=name)
+        for (id, name, alias), changes in result.key_value_store_changes.items():
+            store = await get_kvs(id=id, name=name, alias=alias)
            for key, value in changes.updates.items():
                await store.set_value(key, value.content, value.content_type)

@@ -1520,7 +1607,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

    async def _crawler_state_task(self) -> None:
        """Emit a persist state event with the given migration status."""
-        event_manager =
+        event_manager = self._service_locator.get_event_manager()

        current_state = self.statistics.state

crawlee/crawlers/_playwright/_playwright_crawler.py
CHANGED

@@ -12,6 +12,7 @@ from typing_extensions import NotRequired, TypedDict, TypeVar

 from crawlee import service_locator
 from crawlee._request import Request, RequestOptions
+from crawlee._types import ConcurrencySettings
 from crawlee._utils.blocked import RETRY_CSS_SELECTORS
 from crawlee._utils.docs import docs_group
 from crawlee._utils.robots import RobotsTxtFile
@@ -113,7 +114,10 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]):
        browser_pool: A `BrowserPool` instance to be used for launching the browsers and getting pages.
        user_data_dir: Path to a user data directory, which stores browser session data like cookies
            and local storage.
-        browser_type: The type of browser to launch
+        browser_type: The type of browser to launch:
+            - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+            - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+              the system.
            This option should not be used if `browser_pool` is provided.
        browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
            directly to Playwright's `browser_type.launch` method. For more details, refer to the
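The docstring now documents a fourth `browser_type` value, 'chrome', which drives a locally installed Google Chrome instead of a Playwright-managed browser. A hedged sketch (requires Playwright plus a local Chrome installation):

from crawlee.crawlers import PlaywrightCrawler

# 'chromium', 'firefox' and 'webkit' use Playwright-managed browsers; 'chrome' launches
# the locally installed Google Chrome, so it only works where Chrome is present.
crawler = PlaywrightCrawler(browser_type='chrome', headless=True)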
@@ -152,7 +156,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]):
        ):
            raise ValueError(
                'You cannot provide `headless`, `browser_type`, `browser_launch_options`, '
-                '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir`
+                '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir` or '
                '`fingerprint_generator` arguments when `browser_pool` is provided.'
            )

@@ -194,6 +198,10 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]):

        kwargs['http_client'] = PlaywrightHttpClient() if not kwargs.get('http_client') else kwargs['http_client']

+        # Set default concurrency settings for browser crawlers if not provided
+        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
        super().__init__(**kwargs)

    async def _open_page(
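This hunk makes browser-based crawlers start with `desired_concurrency=1` unless the caller provides their own settings. To restore more aggressive parallelism, pass `ConcurrencySettings` explicitly (the numbers below are just an example):

from crawlee import ConcurrencySettings
from crawlee.crawlers import PlaywrightCrawler

# Overrides the new browser-crawler default of desired_concurrency=1.
crawler = PlaywrightCrawler(
    concurrency_settings=ConcurrencySettings(desired_concurrency=4, max_concurrency=8),
)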
@@ -361,7 +369,9 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]):
            links_iterator: Iterator[str] = iter(
                [url for element in elements if (url := await element.get_attribute('href')) is not None]
            )
-            links_iterator = to_absolute_url_iterator(
+            links_iterator = to_absolute_url_iterator(
+                context.request.loaded_url or context.request.url, links_iterator, logger=context.log
+            )

            if robots_txt_file:
                skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
@@ -489,7 +499,9 @@ class _PlaywrightCrawlerAdditionalOptions(TypedDict):
    """A `BrowserPool` instance to be used for launching the browsers and getting pages."""

    browser_type: NotRequired[BrowserType]
-    """The type of browser to launch
+    """The type of browser to launch:
+    - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+    - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on the system.
    This option should not be used if `browser_pool` is provided."""

    browser_launch_options: NotRequired[Mapping[str, Any]]
@@ -509,9 +521,9 @@ class _PlaywrightCrawlerAdditionalOptions(TypedDict):


 class PlaywrightCrawlerOptions(
-    Generic[TCrawlingContext, TStatisticsState],
     _PlaywrightCrawlerAdditionalOptions,
     BasicCrawlerOptions[TCrawlingContext, StatisticsState],
+    Generic[TCrawlingContext, TStatisticsState],
 ):
     """Arguments for the `AbstractHttpCrawler` constructor.

crawlee/events/_event_manager.py
CHANGED
@@ -130,11 +130,13 @@ class EventManager:
        if not self._active:
            raise RuntimeError(f'The {self.__class__.__name__} is not active.')

+        # Stop persist state event periodic emission and manually emit last one to ensure latest state is saved.
+        await self._emit_persist_state_event_rec_task.stop()
+        await self._emit_persist_state_event()
        await self.wait_for_all_listeners_to_complete(timeout=self._close_timeout)
        self._event_emitter.remove_all_listeners()
        self._listener_tasks.clear()
        self._listeners_to_wrappers.clear()
-        await self._emit_persist_state_event_rec_task.stop()
        self._active = False

    @overload
crawlee/events/_types.py
CHANGED
@@ -40,7 +40,7 @@ class Event(str, Enum):
 class EventPersistStateData(BaseModel):
     """Data for the persist state event."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     is_migrating: Annotated[bool, Field(alias='isMigrating')]

@@ -49,7 +49,7 @@ class EventPersistStateData(BaseModel):
 class EventSystemInfoData(BaseModel):
     """Data for the system info event."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     cpu_info: Annotated[CpuInfo, Field(alias='cpuInfo')]
     memory_info: Annotated[
@@ -62,7 +62,7 @@ class EventSystemInfoData(BaseModel):
 class EventMigratingData(BaseModel):
     """Data for the migrating event."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     # The remaining time in seconds before the migration is forced and the process is killed
     # Optional because it's not present when the event handler is called manually
@@ -73,21 +73,21 @@ class EventMigratingData(BaseModel):
 class EventAbortingData(BaseModel):
     """Data for the aborting event."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)


 @docs_group('Event data')
 class EventExitData(BaseModel):
     """Data for the exit event."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)


 @docs_group('Event data')
 class EventCrawlerStatusData(BaseModel):
     """Data for the crawler status event."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     message: str
     """A message describing the current status of the crawler."""
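All event payload models switch their `model_config` to `validate_by_name=True, validate_by_alias=True`, so payloads can be built from either the Pythonic field names or the camelCase aliases. A small sketch using `EventPersistStateData` (the import path assumes the model is re-exported from `crawlee.events`):

from crawlee.events import EventPersistStateData

# Both spellings validate now: the camelCase alias and the snake_case field name.
by_alias = EventPersistStateData.model_validate({'isMigrating': False})
by_name = EventPersistStateData.model_validate({'is_migrating': False})
assert by_alias == by_name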
crawlee/fingerprint_suite/_header_generator.py
CHANGED

@@ -11,9 +11,9 @@ if TYPE_CHECKING:


 def fingerprint_browser_type_from_playwright_browser_type(
-    playwright_browser_type: Literal['chromium', 'firefox', 'webkit'],
+    playwright_browser_type: Literal['chromium', 'firefox', 'webkit', 'chrome'],
 ) -> SupportedBrowserType:
-    if playwright_browser_type
+    if playwright_browser_type in {'chromium', 'chrome'}:
         return 'chrome'
     if playwright_browser_type == 'firefox':
         return 'firefox'
crawlee/fingerprint_suite/_types.py
CHANGED

@@ -11,7 +11,7 @@ SupportedBrowserType = Literal['chrome', 'firefox', 'safari', 'edge']


 class ScreenOptions(BaseModel):
-    model_config = ConfigDict(extra='forbid',
+    model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True)

     """Defines the screen constrains for the fingerprint generator."""

@@ -31,7 +31,7 @@ class ScreenOptions(BaseModel):
 class HeaderGeneratorOptions(BaseModel):
     """Collection of header related attributes that can be used by the fingerprint generator."""

-    model_config = ConfigDict(extra='forbid',
+    model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True)

     browsers: list[SupportedBrowserType] | None = None
     """List of BrowserSpecifications to generate the headers for."""
crawlee/otel/crawler_instrumentor.py
CHANGED

@@ -69,7 +69,7 @@ class CrawlerInstrumentor(BaseInstrumentor):

        if request_handling_instrumentation:

-            async def
+            async def middleware_wrapper(wrapped: Any, instance: _Middleware, args: Any, kwargs: Any) -> Any:
                with self._tracer.start_as_current_span(
                    name=f'{instance.generator.__name__}, {wrapped.__name__}',  # type:ignore[attr-defined]  # valid in our context
                    attributes={
@@ -111,8 +111,8 @@ class CrawlerInstrumentor(BaseInstrumentor):
        # Handpicked interesting methods to instrument
        self._instrumented.extend(
            [
-                (_Middleware, 'action',
-                (_Middleware, 'cleanup',
+                (_Middleware, 'action', middleware_wrapper),
+                (_Middleware, 'cleanup', middleware_wrapper),
                (ContextPipeline, '__call__', context_pipeline_wrapper),
                (BasicCrawler, '_BasicCrawler__run_task_function', self._simple_async_wrapper),
                (BasicCrawler, '_commit_request_handler_result', _commit_request_handler_result_wrapper),
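For context, `CrawlerInstrumentor` is an OpenTelemetry `BaseInstrumentor`, and the wrapper renamed here is attached to the middleware methods listed above. A hedged sketch of turning it on; it assumes a tracer provider is already configured elsewhere and that `request_handling_instrumentation` is a constructor flag gating these spans, as the hunk suggests:

from crawlee.otel.crawler_instrumentor import CrawlerInstrumentor

# Instruments the handpicked crawler internals listed in the hunk above.
CrawlerInstrumentor(request_handling_instrumentation=True).instrument()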
crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml
CHANGED

@@ -5,8 +5,8 @@
 # % endif
 # % if cookiecutter.http_client == 'curl-impersonate'
 # % do extras.append('curl-impersonate')
-# % elif cookiecutter.http_client == '
-# % do extras.append('
+# % elif cookiecutter.http_client == 'httpx'
+# % do extras.append('httpx')
 # % endif

 [project]
crawlee/request_loaders/_request_list.py
CHANGED

@@ -17,7 +17,7 @@ logger = getLogger(__name__)


 class RequestListState(BaseModel):
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     next_index: Annotated[int, Field(alias='nextIndex')] = 0
     next_unique_key: Annotated[str | None, Field(alias='nextUniqueKey')] = None
crawlee/request_loaders/_sitemap_request_loader.py
CHANGED

@@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Annotated, Any
 from pydantic import BaseModel, ConfigDict, Field
 from typing_extensions import override

-from crawlee import Request
+from crawlee import Request, RequestOptions
 from crawlee._utils.docs import docs_group
 from crawlee._utils.globs import Glob
 from crawlee._utils.recoverable_state import RecoverableState
@@ -18,9 +18,10 @@ from crawlee.request_loaders._request_loader import RequestLoader

 if TYPE_CHECKING:
     import re
-    from collections.abc import Sequence
+    from collections.abc import Callable, Sequence
     from types import TracebackType

+    from crawlee import RequestTransformAction
     from crawlee.http_clients import HttpClient
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.storage_clients.models import ProcessedRequest
@@ -56,7 +57,7 @@ class SitemapRequestLoaderState(BaseModel):
    `in_progress` is cleared.
    """

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    url_queue: Annotated[deque[str], Field(alias='urlQueue')]
    """Queue of URLs extracted from sitemaps and ready for processing."""
@@ -90,6 +91,11 @@ class SitemapRequestLoaderState(BaseModel):
 class SitemapRequestLoader(RequestLoader):
     """A request loader that reads URLs from sitemap(s).

+    The loader is designed to handle sitemaps that follow the format described in the Sitemaps protocol
+    (https://www.sitemaps.org/protocol.html). It supports both XML and plain text sitemap formats.
+    Note that HTML pages containing links are not supported - those should be handled by regular crawlers
+    and the `enqueue_links` functionality.
+
     The loader fetches and parses sitemaps in the background, allowing crawling to start
     before all URLs are loaded. It supports filtering URLs using glob and regex patterns.

@@ -107,6 +113,7 @@ class SitemapRequestLoader(RequestLoader):
        exclude: list[re.Pattern[Any] | Glob] | None = None,
        max_buffer_size: int = 200,
        persist_state_key: str | None = None,
+        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
    ) -> None:
        """Initialize the sitemap request loader.

@@ -120,6 +127,9 @@ class SitemapRequestLoader(RequestLoader):
            persist_state_key: A key for persisting the loader's state in the KeyValueStore.
                When provided, allows resuming from where it left off after interruption.
                If None, no state persistence occurs.
+            transform_request_function: An optional function to transform requests
+                generated by the loader. It receives `RequestOptions` with `url` and should return either
+                modified `RequestOptions` or a `RequestTransformAction`.
        """
        self._http_client = http_client
        self._sitemap_urls = sitemap_urls
@@ -127,6 +137,7 @@ class SitemapRequestLoader(RequestLoader):
        self._exclude = exclude
        self._proxy_info = proxy_info
        self._max_buffer_size = max_buffer_size
+        self._transform_request_function = transform_request_function

        # Synchronization for queue operations
        self._queue_has_capacity = asyncio.Event()
@@ -308,8 +319,15 @@ class SitemapRequestLoader(RequestLoader):

        async with self._queue_lock:
            url = state.url_queue.popleft()
-
-
+            request_option = RequestOptions(url=url)
+            if self._transform_request_function:
+                transform_request_option = self._transform_request_function(request_option)
+                if transform_request_option == 'skip':
+                    state.total_count -= 1
+                    continue
+                if transform_request_option != 'unchanged':
+                    request_option = transform_request_option
+            request = Request.from_url(**request_option)
            state.in_progress.add(request.url)
            if len(state.url_queue) < self._max_buffer_size:
                self._queue_has_capacity.set()
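The final hunk wires the new `transform_request_function` into URL consumption: returning 'skip' drops a URL, 'unchanged' keeps it as-is, and a returned `RequestOptions` dict replaces the original before `Request.from_url(**request_option)` is called. A runnable sketch, assuming `HttpxHttpClient` is available in your installation (the sitemap URL and label are illustrative):

import asyncio

from crawlee import RequestOptions, RequestTransformAction
from crawlee.http_clients import HttpxHttpClient
from crawlee.request_loaders import SitemapRequestLoader


def transform(options: RequestOptions) -> RequestOptions | RequestTransformAction:
    # Drop PDF links entirely, label everything else so handlers can route on it.
    if options['url'].endswith('.pdf'):
        return 'skip'
    options['label'] = 'FROM_SITEMAP'
    return options


async def main() -> None:
    loader = SitemapRequestLoader(
        sitemap_urls=['https://crawlee.dev/sitemap.xml'],
        http_client=HttpxHttpClient(),
        transform_request_function=transform,
    )
    request = await loader.fetch_next_request()
    print(request)


if __name__ == '__main__':
    asyncio.run(main())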