crawlee 0.6.13b43__py3-none-any.whl → 1.1.2b4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlee might be problematic.
- crawlee/_request.py +32 -21
- crawlee/_service_locator.py +4 -4
- crawlee/_types.py +87 -25
- crawlee/_utils/file.py +7 -0
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +15 -0
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +1 -1
- crawlee/_utils/time.py +41 -1
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +4 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +3 -1
- crawlee/crawlers/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +47 -11
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +38 -14
- crawlee/crawlers/_basic/_basic_crawler.py +139 -96
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_playwright/_playwright_crawler.py +52 -10
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/events/_event_manager.py +3 -1
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +12 -0
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/otel/crawler_instrumentor.py +3 -3
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_sitemap_request_loader.py +22 -4
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +32 -1
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_storage_client.py +5 -4
- crawlee/storage_clients/_file_system/_dataset_client.py +6 -7
- crawlee/storage_clients/_file_system/_key_value_store_client.py +7 -8
- crawlee/storage_clients/_file_system/_request_queue_client.py +31 -15
- crawlee/storage_clients/_file_system/_storage_client.py +2 -2
- crawlee/storage_clients/_memory/_dataset_client.py +4 -5
- crawlee/storage_clients/_memory/_key_value_store_client.py +4 -5
- crawlee/storage_clients/_memory/_request_queue_client.py +4 -5
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +295 -0
- crawlee/storage_clients/_redis/_dataset_client.py +325 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
- crawlee/storage_clients/_redis/_storage_client.py +146 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +282 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +10 -10
- crawlee/storages/_base.py +3 -1
- crawlee/storages/_dataset.py +5 -3
- crawlee/storages/_key_value_store.py +11 -6
- crawlee/storages/_request_queue.py +5 -3
- crawlee/storages/_storage_instance_manager.py +54 -68
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/METADATA +17 -5
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/RECORD +80 -58
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/WHEEL +1 -1
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/licenses/LICENSE +0 -0

crawlee/crawlers/_basic/_basic_crawler.py

@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import asyncio
+import functools
 import logging
 import signal
 import sys
@@ -13,8 +14,9 @@ from collections.abc import AsyncGenerator, Awaitable, Callable, Iterable, Seque
 from contextlib import AsyncExitStack, suppress
 from datetime import timedelta
 from functools import partial
+from io import StringIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generic, Literal, cast
+from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast
 from urllib.parse import ParseResult, urlparse
 from weakref import WeakKeyDictionary
 
@@ -31,6 +33,8 @@ from crawlee._service_locator import ServiceLocator
 from crawlee._types import (
     BasicCrawlingContext,
     EnqueueLinksKwargs,
+    ExportDataCsvKwargs,
+    ExportDataJsonKwargs,
     GetKeyValueStoreFromRequestHandlerFunction,
     HttpHeaders,
     HttpPayload,
@@ -40,7 +44,7 @@ from crawlee._types import (
     SkippedReason,
 )
 from crawlee._utils.docs import docs_group
-from crawlee._utils.file import export_csv_to_stream, export_json_to_stream
+from crawlee._utils.file import atomic_write, export_csv_to_stream, export_json_to_stream
 from crawlee._utils.recurring_task import RecurringTask
 from crawlee._utils.robots import RobotsTxtFile
 from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
@@ -96,6 +100,9 @@ if TYPE_CHECKING:
 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 TRequestIterator = TypeVar('TRequestIterator', str, Request)
+TParams = ParamSpec('TParams')
+T = TypeVar('T')
+
 ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Request | None]]
 FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]]
 SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]
@@ -437,14 +444,23 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._statistics_log_format = statistics_log_format
 
         # Statistics
-
-
-
-
-
-
-
-
+        if statistics:
+            self._statistics = statistics
+        else:
+
+            async def persist_state_factory() -> KeyValueStore:
+                return await self.get_key_value_store()
+
+            self._statistics = cast(
+                'Statistics[TStatisticsState]',
+                Statistics.with_default_state(
+                    persistence_enabled=True,
+                    periodic_message_logger=self._logger,
+                    statistics_log_format=self._statistics_log_format,
+                    log_message='Current request statistics:',
+                    persist_state_kvs_factory=persist_state_factory,
+                ),
+            )
 
         # Additional context managers to enter and exit
         self._additional_context_managers = _additional_context_managers or []
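A hedged usage sketch of the `statistics` option handled above (assuming a concrete crawler such as `ParselCrawler`; when the argument is omitted, the crawler builds the default shown in the hunk, persisting state through its own key-value store):

```python
import logging

from crawlee.crawlers import ParselCrawler
from crawlee.statistics import Statistics

# Minimal sketch: pass a pre-configured Statistics instance instead of relying on the default.
# The keyword arguments mirror those used in the hunk above.
crawler = ParselCrawler(
    statistics=Statistics.with_default_state(
        periodic_message_logger=logging.getLogger('my_crawler'),
        log_message='Current request statistics:',
    ),
)
```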
@@ -511,6 +527,24 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._logger.info(f'Crawler.stop() was called with following reason: {reason}.')
         self._unexpected_stop = True
 
+    def _wrap_handler_with_error_context(
+        self, handler: Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]
+    ) -> Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]:
+        """Decorate error handlers to make their context helpers usable."""
+
+        @functools.wraps(handler)
+        async def wrapped_handler(context: TCrawlingContext | BasicCrawlingContext, exception: Exception) -> T:
+            # Original context helpers that are from `RequestHandlerRunResult` will not be commited as the request
+            # failed. Modified context provides context helpers with direct access to the storages.
+            error_context = context.create_modified_copy(
+                push_data=self._push_data,
+                get_key_value_store=self.get_key_value_store,
+                add_requests=functools.partial(self._add_requests, context),
+            )
+            return await handler(error_context, exception)
+
+        return wrapped_handler
+
     def _stop_if_max_requests_count_exceeded(self) -> None:
         """Call `stop` when the maximum number of requests to crawl has been reached."""
         if self._max_requests_per_crawl is None:
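With handlers wrapped this way, the storage helpers on the context stay usable inside error and failed-request handlers. A minimal sketch, assuming a concrete `ParselCrawler` instance:

```python
from crawlee._types import BasicCrawlingContext
from crawlee.crawlers import ParselCrawler

crawler = ParselCrawler()

# Sketch: `context.push_data` inside a failed-request handler now writes directly to the
# dataset, because the wrapped context bypasses the uncommitted result of the failed request.
@crawler.failed_request_handler
async def record_failure(context: BasicCrawlingContext, error: Exception) -> None:
    await context.push_data({'url': context.request.url, 'error': repr(error)})
```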
@@ -609,7 +643,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         The error handler is invoked after a request handler error occurs and before a retry attempt.
         """
-        self._error_handler = handler
+        self._error_handler = self._wrap_handler_with_error_context(handler)
         return handler
 
     def failed_request_handler(
@@ -619,7 +653,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         The failed request handler is invoked when a request has failed all retry attempts.
         """
-        self._failed_request_handler = handler
+        self._failed_request_handler = self._wrap_handler_with_error_context(handler)
         return handler
 
     def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequestCallback:
@@ -659,7 +693,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         request_manager = await self.get_request_manager()
         if purge_request_queue and isinstance(request_manager, RequestQueue):
            await request_manager.drop()
-            self._request_manager = await RequestQueue.open(
+            self._request_manager = await RequestQueue.open(
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )
 
         if requests is not None:
             await self.add_requests(requests)
@@ -686,7 +723,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         except CancelledError:
             pass
         finally:
-            await self._crawler_state_rec_task.stop()
             if threading.current_thread() is threading.main_thread():
                 with suppress(NotImplementedError):
                     asyncio.get_running_loop().remove_signal_handler(signal.SIGINT)
@@ -718,8 +754,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
     async def _run_crawler(self) -> None:
         event_manager = self._service_locator.get_event_manager()
 
-        self._crawler_state_rec_task.start()
-
         # Collect the context managers to be entered. Context managers that are already active are excluded,
         # as they were likely entered by the caller, who will also be responsible for exiting them.
         contexts_to_enter = [
@@ -730,6 +764,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                 self._statistics,
                 self._session_pool if self._use_session_pool else None,
                 self._http_client,
+                self._crawler_state_rec_task,
                 *self._additional_context_managers,
             )
             if cm and getattr(cm, 'active', False) is False
@@ -836,6 +871,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         dataset_id: str | None = None,
         dataset_name: str | None = None,
         dataset_alias: str | None = None,
+        **additional_kwargs: Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs],  # type: ignore[misc]
     ) -> None:
         """Export all items from a Dataset to a JSON or CSV file.
 
@@ -848,6 +884,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             dataset_id: The ID of the Dataset to export from.
             dataset_name: The name of the Dataset to export from (global scope, named storage).
             dataset_alias: The alias of the Dataset to export from (run scope, unnamed storage).
+            additional_kwargs: Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format.
         """
         dataset = await Dataset.open(
             id=dataset_id,
@@ -857,13 +894,18 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             configuration=self._service_locator.get_configuration(),
         )
 
-        path =
-        dst = path.open('w', newline='')
+        path = Path(path)
 
         if path.suffix == '.csv':
-
+            dst = StringIO()
+            csv_kwargs = cast('ExportDataCsvKwargs', additional_kwargs)
+            await export_csv_to_stream(dataset.iterate_items(), dst, **csv_kwargs)
+            await atomic_write(path, dst.getvalue())
         elif path.suffix == '.json':
-
+            dst = StringIO()
+            json_kwargs = cast('ExportDataJsonKwargs', additional_kwargs)
+            await export_json_to_stream(dataset.iterate_items(), dst, **json_kwargs)
+            await atomic_write(path, dst.getvalue())
         else:
             raise ValueError(f'Unsupported file extension: {path.suffix}')
 
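Downstream, the extra keyword arguments surface on `export_data()`. A hedged sketch (the accepted keys are defined by `ExportDataCsvKwargs` and `ExportDataJsonKwargs` in `crawlee/_types.py`; `delimiter` and `indent` are assumed examples of such keys):

```python
from crawlee.crawlers import ParselCrawler

crawler = ParselCrawler()

# Inside an async function, after the crawl has produced data:
await crawler.export_data('results.csv', delimiter=';')  # forwarded to the CSV exporter (assumed key)
await crawler.export_data('results.json', indent=2)      # forwarded to the JSON exporter (assumed key)
```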
@@ -944,6 +986,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
         | None = None,
         requests: Sequence[str | Request] | None = None,
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> None:
         kwargs.setdefault('strategy', 'same-hostname')
@@ -955,7 +1000,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                     '`transform_request_function` arguments when `requests` is provided.'
                 )
             # Add directly passed requests.
-            await context.add_requests(
+            await context.add_requests(
+                requests or list[str | Request](), rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, **kwargs
+            )
         else:
             # Add requests from extracted links.
             await context.add_requests(
@@ -964,7 +1011,11 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                     label=label,
                     user_data=user_data,
                     transform_request_function=transform_request_function,
+                    **kwargs,
                 ),
+                rq_id=rq_id,
+                rq_name=rq_name,
+                rq_alias=rq_alias,
                 **kwargs,
             )
 
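These parameters let `enqueue_links` (and `add_requests`) target a request queue other than the crawler's default. A hedged sketch inside a request handler ('details' is an illustrative queue name):

```python
from crawlee.crawlers import ParselCrawler, ParselCrawlingContext

crawler = ParselCrawler()

@crawler.router.default_handler
async def default_handler(context: ParselCrawlingContext) -> None:
    # Sketch: send matching links to a separate, named request queue via the new rq_name parameter.
    await context.enqueue_links(selector='a.detail', label='DETAIL', rq_name='details')
```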
@@ -1024,8 +1075,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             return target_url.hostname == origin_url.hostname
 
         if strategy == 'same-domain':
-            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).
-            target_domain = self._tld_extractor.extract_str(target_url.hostname).
+            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).top_domain_under_public_suffix
+            target_domain = self._tld_extractor.extract_str(target_url.hostname).top_domain_under_public_suffix
             return origin_domain == target_domain
 
         if strategy == 'same-origin':
@@ -1094,19 +1145,14 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                 except Exception as e:
                     raise UserDefinedErrorHandlerError('Exception thrown in user-defined request error handler') from e
                 else:
-                    if new_request is not None:
-
+                    if new_request is not None and new_request != request:
+                        await request_manager.add_request(new_request)
+                        await self._mark_request_as_handled(request)
+                        return
 
             await request_manager.reclaim_request(request)
         else:
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            await self._mark_request_as_handled(request)
             await self._handle_failed_request(context, error)
             self._statistics.record_request_processing_failure(request.unique_key)
 
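With this change, an error handler that returns a request different from the current one no longer retries it in place: the returned request is enqueued and the original is marked as handled. A hedged sketch (the replacement URL is illustrative):

```python
from crawlee import Request
from crawlee.crawlers import ParselCrawler, ParselCrawlingContext

crawler = ParselCrawler()

@crawler.error_handler
async def use_fallback(context: ParselCrawlingContext, error: Exception) -> Request:
    # Sketch: swap the failing request for a modified one instead of retrying it as-is.
    return Request.from_url(f'{context.request.url}?fallback=1')
```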
@@ -1155,16 +1201,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False
     ) -> None:
         if need_mark and isinstance(request, Request):
-
-
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            await self._mark_request_as_handled(request)
             request.state = RequestState.SKIPPED
 
         url = request.url if isinstance(request, Request) else request
@@ -1237,34 +1274,46 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         else:
             yield Request.from_url(url)
 
-    async def
-
-
-
-
-
-
-
-
-
-
-
-
+    async def _add_requests(
+        self,
+        context: BasicCrawlingContext,
+        requests: Sequence[str | Request],
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
+        **kwargs: Unpack[EnqueueLinksKwargs],
+    ) -> None:
+        """Add requests method aware of the crawling context."""
+        if rq_id or rq_name or rq_alias:
+            request_manager: RequestManager = await RequestQueue.open(
+                id=rq_id,
+                name=rq_name,
+                alias=rq_alias,
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )
+        else:
+            request_manager = await self.get_request_manager()
 
-
+        context_aware_requests = list[Request]()
+        base_url = kwargs.get('base_url') or context.request.loaded_url or context.request.url
+        requests_iterator = self._convert_url_to_request_iterator(requests, base_url)
+        filter_requests_iterator = self._enqueue_links_filter_iterator(requests_iterator, context.request.url, **kwargs)
+        for dst_request in filter_requests_iterator:
+            # Update the crawl depth of the request.
+            dst_request.crawl_depth = context.request.crawl_depth + 1
 
-
-
-            )
+            if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:
+                context_aware_requests.append(dst_request)
 
-
-            # Update the crawl depth of the request.
-            dst_request.crawl_depth = context.request.crawl_depth + 1
+        return await request_manager.add_requests(context_aware_requests)
 
-
-
+    async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
+        """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
+        result = self._context_result_map[context]
 
-
+        for add_requests_call in result.add_requests_calls:
+            await self._add_requests(context, **add_requests_call)
 
         for push_data_call in result.push_data_calls:
             await self._push_data(**push_data_call)
@@ -1364,14 +1413,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                 raise RequestHandlerError(e, context) from e
 
            await self._commit_request_handler_result(context)
-
-
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+
+            await self._mark_request_as_handled(request)
 
             request.state = RequestState.DONE
 
@@ -1414,14 +1457,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                 await request_manager.reclaim_request(request)
                 await self._statistics.error_tracker_retry.add(error=session_error, context=context)
             else:
-                await wait_for(
-                    lambda: request_manager.mark_request_as_handled(context.request),
-                    timeout=self._internal_timeout,
-                    timeout_message='Marking request as handled timed out after '
-                    f'{self._internal_timeout.total_seconds()} seconds',
-                    logger=self._logger,
-                    max_retries=3,
-                )
+                await self._mark_request_as_handled(request)
 
                 await self._handle_failed_request(context, session_error)
                 self._statistics.record_request_processing_failure(request.unique_key)
@@ -1429,14 +1465,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         except ContextPipelineInterruptedError as interrupted_error:
             self._logger.debug('The context pipeline was interrupted', exc_info=interrupted_error)
 
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            await self._mark_request_as_handled(request)
 
         except ContextPipelineInitializationError as initialization_error:
             self._logger.debug(
@@ -1454,12 +1483,15 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             raise
 
     async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
-        await
-
-
-
-
-
+        await self._context_pipeline(
+            context,
+            lambda final_context: wait_for(
+                lambda: self.router(final_context),
+                timeout=self._request_handler_timeout,
+                timeout_message=f'{self._request_handler_timeout_text}'
+                f' {self._request_handler_timeout.total_seconds()} seconds',
+                logger=self._logger,
+            ),
         )
 
     def _raise_for_error_status_code(self, status_code: int) -> None:
@@ -1607,3 +1639,14 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         )
 
         self._previous_crawler_state = current_state
+
+    async def _mark_request_as_handled(self, request: Request) -> None:
+        request_manager = await self.get_request_manager()
+        await wait_for(
+            lambda: request_manager.mark_request_as_handled(request),
+            timeout=self._internal_timeout,
+            timeout_message='Marking request as handled timed out after '
+            f'{self._internal_timeout.total_seconds()} seconds',
+            logger=self._logger,
+            max_retries=3,
+        )
crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py

@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
 from bs4 import BeautifulSoup, Tag
 
 from crawlee._utils.docs import docs_group
-from crawlee.crawlers import AbstractHttpCrawler,
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions
 
 from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
 from ._beautifulsoup_parser import BeautifulSoupParser, BeautifulSoupParserType
@@ -58,7 +58,7 @@ class BeautifulSoupCrawler(AbstractHttpCrawler[BeautifulSoupCrawlingContext, Bea
         self,
         *,
         parser: BeautifulSoupParserType = 'lxml',
-        **kwargs: Unpack[
+        **kwargs: Unpack[HttpCrawlerOptions[BeautifulSoupCrawlingContext]],
     ) -> None:
         """Initialize a new instance.
 
crawlee/crawlers/_parsel/_parsel_crawler.py

@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
 from parsel import Selector
 
 from crawlee._utils.docs import docs_group
-from crawlee.crawlers import AbstractHttpCrawler,
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions
 
 from ._parsel_crawling_context import ParselCrawlingContext
 from ._parsel_parser import ParselParser
@@ -56,7 +56,7 @@ class ParselCrawler(AbstractHttpCrawler[ParselCrawlingContext, Selector, Selecto
 
     def __init__(
         self,
-        **kwargs: Unpack[
+        **kwargs: Unpack[HttpCrawlerOptions[ParselCrawlingContext]],
     ) -> None:
         """Initialize a new instance.
 
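`HttpCrawlerOptions` only changes how the forwarded `**kwargs` are typed; construction stays the same. A brief sketch:

```python
from crawlee.crawlers import ParselCrawler

# The kwargs are still the usual AbstractHttpCrawler/BasicCrawler options,
# now typed via HttpCrawlerOptions[ParselCrawlingContext].
crawler = ParselCrawler(max_requests_per_crawl=100)
```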
crawlee/crawlers/_playwright/_playwright_crawler.py

@@ -3,18 +3,25 @@ from __future__ import annotations
 import asyncio
 import logging
 import warnings
+from datetime import timedelta
 from functools import partial
 from typing import TYPE_CHECKING, Any, Generic, Literal
 
+import playwright.async_api
 from more_itertools import partition
 from pydantic import ValidationError
 from typing_extensions import NotRequired, TypedDict, TypeVar
 
 from crawlee import service_locator
 from crawlee._request import Request, RequestOptions
+from crawlee._types import (
+    BasicCrawlingContext,
+    ConcurrencySettings,
+)
 from crawlee._utils.blocked import RETRY_CSS_SELECTORS
 from crawlee._utils.docs import docs_group
 from crawlee._utils.robots import RobotsTxtFile
+from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
 from crawlee.browsers import BrowserPool
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
@@ -43,7 +50,6 @@ if TYPE_CHECKING:
 
     from crawlee import RequestTransformAction
     from crawlee._types import (
-        BasicCrawlingContext,
         EnqueueLinksKwargs,
         ExtractLinksFunction,
         HttpHeaders,
@@ -105,6 +111,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         fingerprint_generator: FingerprintGenerator | None | Literal['default'] = 'default',
         headless: bool | None = None,
         use_incognito_pages: bool | None = None,
+        navigation_timeout: timedelta | None = None,
         **kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext, StatisticsState]],
     ) -> None:
         """Initialize a new instance.
@@ -113,7 +120,10 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             browser_pool: A `BrowserPool` instance to be used for launching the browsers and getting pages.
             user_data_dir: Path to a user data directory, which stores browser session data like cookies
                 and local storage.
-            browser_type: The type of browser to launch
+            browser_type: The type of browser to launch:
+                - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+                - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+                    the system.
                 This option should not be used if `browser_pool` is provided.
             browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
                 directly to Playwright's `browser_type.launch` method. For more details, refer to the
@@ -130,12 +140,16 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
                 own context that is destroyed once the page is closed or crashes.
                 This option should not be used if `browser_pool` is provided.
+            navigation_timeout: Timeout for navigation (the process between opening a Playwright page and calling
+                the request handler)
             kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
         """
         configuration = kwargs.pop('configuration', None)
         if configuration is not None:
             service_locator.set_configuration(configuration)
 
+        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}
+
         if browser_pool:
             # Raise an exception if browser_pool is provided together with other browser-related arguments.
             if any(
@@ -152,7 +166,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         ):
             raise ValueError(
                 'You cannot provide `headless`, `browser_type`, `browser_launch_options`, '
-                '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir`
+                '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir` or '
                 '`fingerprint_generator` arguments when `browser_pool` is provided.'
             )
 
@@ -194,6 +208,12 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
 
         kwargs['http_client'] = PlaywrightHttpClient() if not kwargs.get('http_client') else kwargs['http_client']
 
+        # Set default concurrency settings for browser crawlers if not provided
+        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
+        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
+
         super().__init__(**kwargs)
 
     async def _open_page(
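A hedged sketch of the new option (values are illustrative; per the hunk above, the default navigation timeout is one minute, and browser crawlers now default to a desired concurrency of 1 unless `concurrency_settings` is provided):

```python
from datetime import timedelta

from crawlee.crawlers import PlaywrightCrawler

# Sketch: cap the whole navigation phase (pre-navigation hooks plus page.goto)
# at 30 seconds instead of the 1-minute default.
crawler = PlaywrightCrawler(
    browser_type='firefox',
    navigation_timeout=timedelta(seconds=30),
)
```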
@@ -220,10 +240,18 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             block_requests=partial(block_requests, page=crawlee_page.page),
         )
 
-
-
-
-
+        context_id = id(pre_navigation_context)
+        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
+
+        try:
+            async with browser_page_context(crawlee_page.page):
+                for hook in self._pre_navigation_hooks:
+                    async with self._shared_navigation_timeouts[context_id]:
+                        await hook(pre_navigation_context)
+
+            yield pre_navigation_context
+        finally:
+            self._shared_navigation_timeouts.pop(context_id, None)
 
     def _prepare_request_interceptor(
         self,
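Pre-navigation hooks now draw from the same `SharedTimeout` budget as the navigation itself, so a slow hook leaves less time for `page.goto()`. A hedged sketch of registering such a hook (the viewport size is illustrative):

```python
from crawlee.crawlers import PlaywrightCrawler, PlaywrightPreNavCrawlingContext

crawler = PlaywrightCrawler()

async def set_viewport(context: PlaywrightPreNavCrawlingContext) -> None:
    # Sketch: this hook runs inside the shared navigation-timeout budget.
    await context.page.set_viewport_size({'width': 1280, 'height': 720})

crawler.pre_navigation_hook(set_viewport)
```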
@@ -258,6 +286,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         Raises:
             ValueError: If the browser pool is not initialized.
             SessionError: If the URL cannot be loaded by the browser.
+            TimeoutError: If navigation does not succeed within the navigation timeout.
 
         Yields:
             The enhanced crawling context with the Playwright-specific features (page, response, enqueue_links,
@@ -289,7 +318,13 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             # Set route_handler only for current request
             await context.page.route(context.request.url, route_handler)
 
-
+            try:
+                async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+                    response = await context.page.goto(
+                        context.request.url, timeout=remaining_timeout.total_seconds() * 1000
+                    )
+            except playwright.async_api.TimeoutError as exc:
+                raise asyncio.TimeoutError from exc
 
             if response is None:
                 raise SessionError(f'Failed to load the URL: {context.request.url}')
@@ -361,7 +396,12 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
            links_iterator: Iterator[str] = iter(
                 [url for element in elements if (url := await element.get_attribute('href')) is not None]
             )
-
+
+            # Get base URL from <base> tag if present
+            extracted_base_url = await context.page.evaluate('document.baseURI')
+            base_url: str = extracted_base_url or context.request.loaded_url or context.request.url
+
+            links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)
 
             if robots_txt_file:
                 skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
@@ -489,7 +529,9 @@ class _PlaywrightCrawlerAdditionalOptions(TypedDict):
     """A `BrowserPool` instance to be used for launching the browsers and getting pages."""
 
     browser_type: NotRequired[BrowserType]
-    """The type of browser to launch
+    """The type of browser to launch:
+    - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+    - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on the system.
     This option should not be used if `browser_pool` is provided."""
 
     browser_launch_options: NotRequired[Mapping[str, Any]]
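A hedged sketch of the newly documented `'chrome'` value (assumes Google Chrome is installed locally):

```python
from crawlee.crawlers import PlaywrightCrawler

# Sketch: 'chrome' launches the locally installed Google Chrome
# instead of a Playwright-managed Chromium build.
crawler = PlaywrightCrawler(browser_type='chrome', headless=False)
```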
crawlee/crawlers/_playwright/_playwright_http_client.py

@@ -59,6 +59,7 @@ class PlaywrightHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         raise NotImplementedError('The `crawl` method should not be used for `PlaywrightHttpClient`')
 
@@ -72,6 +73,7 @@ class PlaywrightHttpClient(HttpClient):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         # `proxy_info` are not used because `APIRequestContext` inherits the proxy from `BrowserContext`
         # TODO: Use `session` to restore all the fingerprint headers according to the `BrowserContext`, after resolved
@@ -87,7 +89,11 @@ class PlaywrightHttpClient(HttpClient):
 
         # Proxies appropriate to the browser context are used
         response = await browser_context.request.fetch(
-            url_or_request=url,
+            url_or_request=url,
+            method=method.lower(),
+            headers=dict(headers) if headers else None,
+            data=payload,
+            timeout=timeout.total_seconds() if timeout else None,
         )
 
         return await PlaywrightHttpResponse.from_playwright_response(response, protocol='')
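With method, headers, payload, and timeout now forwarded to `browser_context.request.fetch`, `send_request` issued from a Playwright crawling context can perform non-GET calls. A hedged sketch inside a request handler (URL and body are illustrative, and the exact `send_request` parameters are assumed from `crawlee/_types.py`):

```python
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext

crawler = PlaywrightCrawler()

@crawler.router.default_handler
async def handler(context: PlaywrightCrawlingContext) -> None:
    # Sketch: a POST issued through the browser context, relying on the newly
    # forwarded method/headers/payload arguments.
    response = await context.send_request(
        'https://example.com/api/submit',
        method='POST',
        headers={'content-type': 'application/json'},
        payload=b'{"ok": true}',
    )
    context.log.info(f'API responded with status {response.status_code}')
```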