crawlee 1.0.5b18__py3-none-any.whl → 1.2.2b24__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to their public registry. It is provided for informational purposes only.
- crawlee/__init__.py +2 -1
- crawlee/_browserforge_workaround.py +7 -3
- crawlee/_request.py +32 -13
- crawlee/_types.py +44 -5
- crawlee/_utils/context.py +3 -3
- crawlee/_utils/file.py +8 -1
- crawlee/_utils/globs.py +4 -4
- crawlee/_utils/recurring_task.py +12 -3
- crawlee/_utils/sitemap.py +12 -5
- crawlee/_utils/system.py +27 -11
- crawlee/_utils/time.py +41 -1
- crawlee/browsers/_browser_pool.py +1 -1
- crawlee/browsers/_playwright_browser.py +2 -1
- crawlee/crawlers/__init__.py +5 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +53 -17
- crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +20 -49
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
- crawlee/crawlers/_basic/_basic_crawler.py +138 -124
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +23 -4
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +60 -22
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +12 -6
- crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +68 -14
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/otel/crawler_instrumentor.py +1 -3
- crawlee/request_loaders/_sitemap_request_loader.py +18 -5
- crawlee/router.py +13 -3
- crawlee/sessions/_cookies.py +13 -8
- crawlee/sessions/_models.py +3 -3
- crawlee/statistics/_models.py +51 -9
- crawlee/statistics/_statistics.py +2 -21
- crawlee/storage_clients/_base/_dataset_client.py +2 -2
- crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_file_system/_dataset_client.py +6 -5
- crawlee/storage_clients/_file_system/_key_value_store_client.py +7 -4
- crawlee/storage_clients/_file_system/_request_queue_client.py +5 -4
- crawlee/storage_clients/_redis/_client_mixin.py +1 -4
- crawlee/storage_clients/_redis/_dataset_client.py +6 -2
- crawlee/storage_clients/_redis/_key_value_store_client.py +3 -5
- crawlee/storage_clients/_redis/_request_queue_client.py +5 -8
- crawlee/storage_clients/_redis/_storage_client.py +12 -9
- crawlee/storage_clients/_redis/_utils.py +1 -1
- crawlee/storage_clients/_sql/_client_mixin.py +1 -1
- crawlee/storage_clients/_sql/_storage_client.py +0 -9
- crawlee/storage_clients/models.py +8 -3
- crawlee/storages/_storage_instance_manager.py +103 -44
- {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/METADATA +10 -16
- {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/RECORD +63 -62
- {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/WHEEL +1 -1
- {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/entry_points.txt +0 -0
- {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/licenses/LICENSE +0 -0
crawlee/crawlers/_basic/_basic_crawler.py +138 -124

@@ -2,6 +2,7 @@
 from __future__ import annotations

 import asyncio
+import functools
 import logging
 import signal
 import sys
@@ -13,8 +14,9 @@ from collections.abc import AsyncGenerator, Awaitable, Callable, Iterable, Seque
 from contextlib import AsyncExitStack, suppress
 from datetime import timedelta
 from functools import partial
+from io import StringIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generic, Literal, cast
+from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast
 from urllib.parse import ParseResult, urlparse
 from weakref import WeakKeyDictionary

@@ -31,6 +33,8 @@ from crawlee._service_locator import ServiceLocator
 from crawlee._types import (
     BasicCrawlingContext,
     EnqueueLinksKwargs,
+    ExportDataCsvKwargs,
+    ExportDataJsonKwargs,
     GetKeyValueStoreFromRequestHandlerFunction,
     HttpHeaders,
     HttpPayload,
@@ -40,7 +44,7 @@ from crawlee._types import (
     SkippedReason,
 )
 from crawlee._utils.docs import docs_group
-from crawlee._utils.file import export_csv_to_stream, export_json_to_stream
+from crawlee._utils.file import atomic_write, export_csv_to_stream, export_json_to_stream
 from crawlee._utils.recurring_task import RecurringTask
 from crawlee._utils.robots import RobotsTxtFile
 from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
@@ -55,8 +59,9 @@ from crawlee.errors import (
     RequestHandlerError,
     SessionError,
     UserDefinedErrorHandlerError,
+    UserHandlerTimeoutError,
 )
-from crawlee.events._types import Event, EventCrawlerStatusData
+from crawlee.events._types import Event, EventCrawlerStatusData
 from crawlee.http_clients import ImpitHttpClient
 from crawlee.router import Router
 from crawlee.sessions import SessionPool
@@ -64,6 +69,7 @@ from crawlee.statistics import Statistics, StatisticsState
 from crawlee.storages import Dataset, KeyValueStore, RequestQueue

 from ._context_pipeline import ContextPipeline
+from ._context_utils import swapped_context
 from ._logging_utils import (
     get_one_line_error_summary_if_possible,
     reduce_asyncio_timeout_error_to_relevant_traceback_parts,
@@ -96,6 +102,9 @@ if TYPE_CHECKING:
 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 TRequestIterator = TypeVar('TRequestIterator', str, Request)
+TParams = ParamSpec('TParams')
+T = TypeVar('T')
+
 ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Request | None]]
 FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]]
 SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]
@@ -401,7 +410,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._context_result_map = WeakKeyDictionary[BasicCrawlingContext, RequestHandlerRunResult]()

         # Context pipeline
-        self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)
+        self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)  # ty: ignore[invalid-argument-type]

         # Crawl settings
         self._max_request_retries = max_request_retries
@@ -520,6 +529,24 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._logger.info(f'Crawler.stop() was called with following reason: {reason}.')
         self._unexpected_stop = True

+    def _wrap_handler_with_error_context(
+        self, handler: Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]
+    ) -> Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]:
+        """Decorate error handlers to make their context helpers usable."""
+
+        @functools.wraps(handler)
+        async def wrapped_handler(context: TCrawlingContext | BasicCrawlingContext, exception: Exception) -> T:
+            # Original context helpers that are from `RequestHandlerRunResult` will not be committed as the request
+            # failed. Modified context provides context helpers with direct access to the storages.
+            error_context = context.create_modified_copy(
+                push_data=self._push_data,
+                get_key_value_store=self.get_key_value_store,
+                add_requests=functools.partial(self._add_requests, context),
+            )
+            return await handler(error_context, exception)
+
+        return wrapped_handler
+
     def _stop_if_max_requests_count_exceeded(self) -> None:
         """Call `stop` when the maximum number of requests to crawl has been reached."""
         if self._max_requests_per_crawl is None:
@@ -618,7 +645,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

         The error handler is invoked after a request handler error occurs and before a retry attempt.
         """
-        self._error_handler = handler
+        self._error_handler = self._wrap_handler_with_error_context(handler)
         return handler

     def failed_request_handler(
@@ -628,7 +655,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

         The failed request handler is invoked when a request has failed all retry attempts.
         """
-        self._failed_request_handler = handler
+        self._failed_request_handler = self._wrap_handler_with_error_context(handler)
         return handler

     def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequestCallback:
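With the wrapping above, `error_handler` and `failed_request_handler` callbacks now receive a modified context whose `push_data`, `get_key_value_store`, and `add_requests` helpers talk to the storages directly, since the buffered handler result is never committed for a failed request. A minimal usage sketch (the handler bodies and URL are illustrative, not taken from this diff):

```python
import asyncio

from crawlee.crawlers import BasicCrawler, BasicCrawlingContext


async def main() -> None:
    crawler = BasicCrawler(max_request_retries=1)

    @crawler.router.default_handler
    async def default_handler(context: BasicCrawlingContext) -> None:
        raise RuntimeError('Simulated handler failure')

    @crawler.failed_request_handler
    async def on_failed(context: BasicCrawlingContext, error: Exception) -> None:
        # The wrapped context persists this push_data call even though the
        # request failed and its regular handler result is discarded.
        await context.push_data({'url': context.request.url, 'error': str(error)})

    await crawler.run(['https://example.com'])


if __name__ == '__main__':
    asyncio.run(main())
```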
@@ -747,13 +774,10 @@

         async with AsyncExitStack() as exit_stack:
             for context in contexts_to_enter:
-                await exit_stack.enter_async_context(context)  #
+                await exit_stack.enter_async_context(context)  # ty: ignore[invalid-argument-type]

             await self._autoscaled_pool.run()

-        # Emit PERSIST_STATE event when crawler is finishing to allow listeners to persist their state if needed
-        event_manager.emit(event=Event.PERSIST_STATE, event_data=EventPersistStateData(is_migrating=False))
-
     async def add_requests(
         self,
         requests: Sequence[str | Request],
@@ -849,6 +873,7 @@
         dataset_id: str | None = None,
         dataset_name: str | None = None,
         dataset_alias: str | None = None,
+        **additional_kwargs: Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs],
     ) -> None:
         """Export all items from a Dataset to a JSON or CSV file.

@@ -861,6 +886,7 @@
             dataset_id: The ID of the Dataset to export from.
             dataset_name: The name of the Dataset to export from (global scope, named storage).
             dataset_alias: The alias of the Dataset to export from (run scope, unnamed storage).
+            additional_kwargs: Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format.
         """
         dataset = await Dataset.open(
             id=dataset_id,
@@ -870,13 +896,18 @@
             configuration=self._service_locator.get_configuration(),
         )

-        path =
-        dst = path.open('w', newline='')
+        path = Path(path)

         if path.suffix == '.csv':
-
+            dst = StringIO()
+            csv_kwargs = cast('ExportDataCsvKwargs', additional_kwargs)
+            await export_csv_to_stream(dataset.iterate_items(), dst, **csv_kwargs)
+            await atomic_write(path, dst.getvalue())
         elif path.suffix == '.json':
-
+            dst = StringIO()
+            json_kwargs = cast('ExportDataJsonKwargs', additional_kwargs)
+            await export_json_to_stream(dataset.iterate_items(), dst, **json_kwargs)
+            await atomic_write(path, dst.getvalue())
         else:
             raise ValueError(f'Unsupported file extension: {path.suffix}')

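`export_data` now accepts format-specific keyword arguments and writes the serialized output atomically via `atomic_write`. A hedged usage sketch; the concrete keys accepted by `ExportDataCsvKwargs` and `ExportDataJsonKwargs` are not visible in this diff, so `delimiter` and `indent` below are assumptions modeled on the standard csv/json options:

```python
import asyncio

from crawlee.crawlers import BasicCrawler


async def main() -> None:
    crawler = BasicCrawler()
    # ... run the crawler and push data to the default dataset ...

    # Assumed kwargs: `delimiter` forwarded to the CSV writer, `indent` to the JSON serializer.
    await crawler.export_data('results.csv', delimiter=';')
    await crawler.export_data('results.json', indent=2)


if __name__ == '__main__':
    asyncio.run(main())
```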
@@ -1008,7 +1039,12 @@
         warning_flag = True

         for request in request_iterator:
-
+            if isinstance(request, Request):
+                if request.enqueue_strategy != strategy:
+                    request.enqueue_strategy = strategy
+                target_url = request.url
+            else:
+                target_url = request
             parsed_target_url = urlparse(target_url)

             if warning_flag and strategy != 'all' and not parsed_target_url.hostname:
@@ -1046,8 +1082,8 @@
             return target_url.hostname == origin_url.hostname

         if strategy == 'same-domain':
-            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).
-            target_domain = self._tld_extractor.extract_str(target_url.hostname).
+            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).top_domain_under_public_suffix
+            target_domain = self._tld_extractor.extract_str(target_url.hostname).top_domain_under_public_suffix
             return origin_domain == target_domain

         if strategy == 'same-origin':
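The `same-domain` strategy now reads `top_domain_under_public_suffix` from the tldextract result (the attribute used previously is truncated in this diff; in recent tldextract releases this property supersedes `registered_domain`). A standalone sketch of the comparison, assuming a tldextract version that exposes that property:

```python
import tldextract

extractor = tldextract.TLDExtract()

origin = extractor.extract_str('blog.example.co.uk')
target = extractor.extract_str('shop.example.co.uk')

# Both resolve to 'example.co.uk', so the target would pass a 'same-domain' check.
print(origin.top_domain_under_public_suffix == target.top_domain_under_public_suffix)
```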
@@ -1105,7 +1141,7 @@
             request.retry_count += 1
             reduced_error = str(error).split('\n')[0]
             self.log.warning(
-                f'Retrying request to {context.request.url} due to: {reduced_error}'
+                f'Retrying request to {context.request.url} due to: {reduced_error}. '
                 f'{get_one_line_error_summary_if_possible(error)}'
             )
             await self._statistics.error_tracker.add(error=error, context=context)
@@ -1116,19 +1152,15 @@
                 except Exception as e:
                     raise UserDefinedErrorHandlerError('Exception thrown in user-defined request error handler') from e
                 else:
-                    if new_request is not None:
-
+                    if new_request is not None and new_request != request:
+                        await request_manager.add_request(new_request)
+                        await self._mark_request_as_handled(request)
+                        return

             await request_manager.reclaim_request(request)
         else:
-
-
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            request.state = RequestState.ERROR
+            await self._mark_request_as_handled(request)
             await self._handle_failed_request(context, error)
             self._statistics.record_request_processing_failure(request.unique_key)

@@ -1143,8 +1175,6 @@
                 f'{self._internal_timeout.total_seconds()} seconds',
                 logger=self._logger,
             )
-
-            context.request.state = RequestState.DONE
         except UserDefinedErrorHandlerError:
             context.request.state = RequestState.ERROR
             raise
@@ -1177,17 +1207,8 @@
         self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False
     ) -> None:
         if need_mark and isinstance(request, Request):
-            request_manager = await self.get_request_manager()
-
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
             request.state = RequestState.SKIPPED
+            await self._mark_request_as_handled(request)

         url = request.url if isinstance(request, Request) else request

@@ -1207,10 +1228,11 @@

         if (
             isinstance(error, asyncio.exceptions.TimeoutError)
+            and traceback_parts
             and self._request_handler_timeout_text in traceback_parts[-1]
-        ):
+        ) or isinstance(error, UserHandlerTimeoutError):
             used_traceback_parts = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
-            used_traceback_parts.
+            used_traceback_parts.extend(traceback_parts[-1:])

         return ''.join(used_traceback_parts).strip('\n')

@@ -1259,58 +1281,54 @@
             else:
                 yield Request.from_url(url)

-    async def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                storage_client=self._service_locator.get_storage_client(),
-                configuration=self._service_locator.get_configuration(),
-            )
-        else:
-            request_manager = base_request_manager
-
-        requests = list[Request]()
-
-        base_url = url if (url := add_requests_call.get('base_url')) else origin
-
-        requests_iterator = self._convert_url_to_request_iterator(add_requests_call['requests'], base_url)
+    async def _add_requests(
+        self,
+        context: BasicCrawlingContext,
+        requests: Sequence[str | Request],
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
+        **kwargs: Unpack[EnqueueLinksKwargs],
+    ) -> None:
+        """Add requests method aware of the crawling context."""
+        if rq_id or rq_name or rq_alias:
+            request_manager: RequestManager = await RequestQueue.open(
+                id=rq_id,
+                name=rq_name,
+                alias=rq_alias,
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )
+        else:
+            request_manager = await self.get_request_manager()

-
+        context_aware_requests = list[Request]()
+        base_url = kwargs.get('base_url') or context.request.loaded_url or context.request.url
+        requests_iterator = self._convert_url_to_request_iterator(requests, base_url)
+        filter_requests_iterator = self._enqueue_links_filter_iterator(requests_iterator, context.request.url, **kwargs)
+        for dst_request in filter_requests_iterator:
+            # Update the crawl depth of the request.
+            dst_request.crawl_depth = context.request.crawl_depth + 1

-
-
-            )
+            if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:
+                context_aware_requests.append(dst_request)

-
-            # Update the crawl depth of the request.
-            dst_request.crawl_depth = context.request.crawl_depth + 1
+        return await request_manager.add_requests(context_aware_requests)

-
-
+    async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
+        """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
+        result = self._context_result_map[context]

-
+        for add_requests_call in result.add_requests_calls:
+            await self._add_requests(context, **add_requests_call)

         for push_data_call in result.push_data_calls:
             await self._push_data(**push_data_call)

         await self._commit_key_value_store_changes(result, get_kvs=self.get_key_value_store)

+        result.apply_request_changes(target=context.request)
+
     @staticmethod
     async def _commit_key_value_store_changes(
         result: RequestHandlerRunResult, get_kvs: GetKeyValueStoreFromRequestHandlerFunction
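The new `_add_requests` method backs the `add_requests` context helper: it can target a specific request queue via `rq_id`, `rq_name`, or `rq_alias`, resolves relative URLs against the loaded URL, applies the enqueue-strategy filter, and drops requests beyond `max_crawl_depth`. A usage sketch of the public helper from inside a handler, assuming it forwards the same `rq_*` selectors (URLs are illustrative):

```python
import asyncio

from crawlee.crawlers import BasicCrawler, BasicCrawlingContext


async def main() -> None:
    crawler = BasicCrawler()

    @crawler.router.default_handler
    async def default_handler(context: BasicCrawlingContext) -> None:
        # Relative URLs are resolved against the loaded URL of the current request;
        # requests exceeding max_crawl_depth are filtered out before enqueueing.
        await context.add_requests(
            ['https://example.com/next', '/relative/path'],
            rq_alias='follow-ups',  # route these into a separate, run-scoped queue
        )

    await crawler.run(['https://example.com'])


if __name__ == '__main__':
    asyncio.run(main())
```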
@@ -1376,10 +1394,10 @@
         else:
             session = await self._get_session()
             proxy_info = await self._get_proxy_info(request, session)
-        result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store)
+        result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store, request=request)

         context = BasicCrawlingContext(
-            request=request,
+            request=result.request,
             session=session,
             proxy_info=proxy_info,
             send_request=self._prepare_send_request_function(session, proxy_info),
@@ -1396,32 +1414,26 @@
         try:
             request.state = RequestState.REQUEST_HANDLER

-            self._check_request_collision(context.request, context.session)
-
             try:
-
+                with swapped_context(context, request):
+                    self._check_request_collision(request, session)
+                    await self._run_request_handler(context=context)
             except asyncio.TimeoutError as e:
                 raise RequestHandlerError(e, context) from e

             await self._commit_request_handler_result(context)
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )

             request.state = RequestState.DONE

-
-
+            await self._mark_request_as_handled(request)
+
+            if session and session.is_usable:
+                session.mark_good()

             self._statistics.record_request_processing_finish(request.unique_key)

         except RequestCollisionError as request_error:
-
+            request.no_retry = True
             await self._handle_request_error(context, request_error)

         except RequestHandlerError as primary_error:
@@ -1436,7 +1448,7 @@
             await self._handle_request_error(primary_error.crawling_context, primary_error.wrapped_exception)

         except SessionError as session_error:
-            if not
+            if not session:
                 raise RuntimeError('SessionError raised in a crawling context without a session') from session_error

             if self._error_handler:
@@ -1446,22 +1458,16 @@
                 exc_only = ''.join(traceback.format_exception_only(session_error)).strip()
                 self._logger.warning('Encountered "%s", rotating session and retrying...', exc_only)

-
+                if session:
+                    session.retire()

                 # Increment session rotation count.
-
+                request.session_rotation_count = (request.session_rotation_count or 0) + 1

                 await request_manager.reclaim_request(request)
                 await self._statistics.error_tracker_retry.add(error=session_error, context=context)
             else:
-                await
-                    lambda: request_manager.mark_request_as_handled(context.request),
-                    timeout=self._internal_timeout,
-                    timeout_message='Marking request as handled timed out after '
-                    f'{self._internal_timeout.total_seconds()} seconds',
-                    logger=self._logger,
-                    max_retries=3,
-                )
+                await self._mark_request_as_handled(request)

                 await self._handle_failed_request(context, session_error)
                 self._statistics.record_request_processing_failure(request.unique_key)
@@ -1469,14 +1475,7 @@
         except ContextPipelineInterruptedError as interrupted_error:
             self._logger.debug('The context pipeline was interrupted', exc_info=interrupted_error)

-            await
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            await self._mark_request_as_handled(request)

         except ContextPipelineInitializationError as initialization_error:
             self._logger.debug(
@@ -1494,12 +1493,16 @@
             raise

     async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
-
-
-
-
-
-
+        context.request.state = RequestState.BEFORE_NAV
+        await self._context_pipeline(
+            context,
+            lambda final_context: wait_for(
+                lambda: self.router(final_context),
+                timeout=self._request_handler_timeout,
+                timeout_message=f'{self._request_handler_timeout_text}'
+                f' {self._request_handler_timeout.total_seconds()} seconds',
+                logger=self._logger,
+            ),
         )

     def _raise_for_error_status_code(self, status_code: int) -> None:
@@ -1647,3 +1650,14 @@
         )

         self._previous_crawler_state = current_state
+
+    async def _mark_request_as_handled(self, request: Request) -> None:
+        request_manager = await self.get_request_manager()
+        await wait_for(
+            lambda: request_manager.mark_request_as_handled(request),
+            timeout=self._internal_timeout,
+            timeout_message='Marking request as handled timed out after '
+            f'{self._internal_timeout.total_seconds()} seconds',
+            logger=self._logger,
+            max_retries=3,
+        )
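The repeated `wait_for(...)` blocks around `mark_request_as_handled` are now consolidated into the single `_mark_request_as_handled` helper shown above. The pattern itself is a per-attempt timeout with a bounded number of retries; a generic standard-library analogue (not crawlee's internal `wait_for`) looks like this:

```python
import asyncio
from collections.abc import Awaitable, Callable
from typing import TypeVar

T = TypeVar('T')


async def wait_for_with_retries(
    operation: Callable[[], Awaitable[T]],
    *,
    timeout: float,
    max_retries: int,
) -> T:
    """Run `operation` with a per-attempt timeout, retrying up to `max_retries` attempts."""
    for attempt in range(1, max_retries + 1):
        try:
            return await asyncio.wait_for(operation(), timeout=timeout)
        except asyncio.TimeoutError:
            if attempt == max_retries:
                raise
    raise AssertionError('unreachable')


async def main() -> None:
    async def mark_handled() -> str:
        await asyncio.sleep(0.01)
        return 'handled'

    print(await wait_for_with_retries(mark_handled, timeout=1.0, max_retries=3))


if __name__ == '__main__':
    asyncio.run(main())
```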
crawlee/crawlers/_basic/_context_utils.py +24 -0 (new file)

@@ -0,0 +1,24 @@
+from __future__ import annotations
+
+from contextlib import contextmanager
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+
+    from crawlee._request import Request
+
+    from ._basic_crawling_context import BasicCrawlingContext
+
+
+@contextmanager
+def swapped_context(
+    context: BasicCrawlingContext,
+    request: Request,
+) -> Iterator[None]:
+    """Replace context's isolated copies with originals after handler execution."""
+    try:
+        yield
+    finally:
+        # Restore original context state to avoid side effects between different handlers.
+        object.__setattr__(context, 'request', request)
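`swapped_context` restores the original `request` object on the crawling context once the handler finishes; `object.__setattr__` is used because normal assignment is blocked on the (presumably frozen) dataclass context. A self-contained sketch of the same restore-on-exit pattern:

```python
from __future__ import annotations

from collections.abc import Iterator
from contextlib import contextmanager
from dataclasses import dataclass


@dataclass(frozen=True)
class Context:
    request: str


@contextmanager
def swapped(context: Context, original: str) -> Iterator[None]:
    try:
        yield
    finally:
        # Frozen dataclasses block normal attribute assignment, so bypass it.
        object.__setattr__(context, 'request', original)


ctx = Context(request='working-copy')
with swapped(ctx, 'original'):
    print(ctx.request)  # -> working-copy
print(ctx.request)  # -> original
```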
crawlee/crawlers/_basic/_logging_utils.py +23 -4

@@ -2,9 +2,21 @@ import asyncio
 import re
 import traceback

+import crawlee.errors
+

 def _get_only_innermost_exception(error: BaseException) -> BaseException:
-    """Get innermost exception by following __cause__ and __context__ attributes of exception.
+    """Get innermost exception by following __cause__ and __context__ attributes of exception.
+
+    If the innermost exception is UserHandlerTimeoutError, return whatever caused that if possible.
+    """
+    if type(error) is crawlee.errors.UserHandlerTimeoutError:
+        if error.__cause__:
+            return error.__cause__
+        if error.__context__:
+            return error.__context__
+        return error
+
     if error.__cause__:
         return _get_only_innermost_exception(error.__cause__)
     if error.__context__:
@@ -34,7 +46,7 @@ def _strip_pep657_highlighting(traceback_part: str) -> str:


 def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
-    timeout_error: asyncio.exceptions.TimeoutError,
+    timeout_error: asyncio.exceptions.TimeoutError | crawlee.errors.UserHandlerTimeoutError,
 ) -> list[str]:
     innermost_error_traceback_parts = _get_traceback_parts_for_innermost_exception(timeout_error)
     return _get_filtered_traceback_parts_for_asyncio_timeout_error(innermost_error_traceback_parts)
@@ -43,13 +55,20 @@
 def _get_traceback_parts_for_innermost_exception(error: Exception) -> list[str]:
     innermost_error = _get_only_innermost_exception(error)
     return traceback.format_exception(
-        type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=
+        type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=False
     )


 def get_one_line_error_summary_if_possible(error: Exception) -> str:
     if isinstance(error, asyncio.exceptions.TimeoutError):
-
+        relevant_part = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
+        most_relevant_part = (',' + relevant_part[-1]) if len(relevant_part) else ''
+    elif isinstance(error, crawlee.errors.UserHandlerTimeoutError):
+        # Error is user defined handler. First two lines should be location of the `UserHandlerTimeoutError` in crawlee
+        # code and third line the topmost user error
+        traceback_parts = _get_traceback_parts_for_innermost_exception(error)
+        relevant_index_from_start = 3
+        most_relevant_part = traceback_parts[2] if len(traceback_parts) >= relevant_index_from_start else ''
     elif 'playwright._impl._errors.Error' in str(error.__class__):
         # Playwright autogenerated errors are often very long, so we do not try to summarize them at all as they anyway
         # point to deep internals.
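`_get_only_innermost_exception` walks the `__cause__` and `__context__` chain to the root error, and now unwraps one extra level when the outer exception is `UserHandlerTimeoutError`. A standalone illustration of the chain walking:

```python
def innermost(error: BaseException) -> BaseException:
    """Follow explicit (`raise ... from`) and implicit exception chaining to the root cause."""
    if error.__cause__:
        return innermost(error.__cause__)
    if error.__context__:
        return innermost(error.__context__)
    return error


try:
    try:
        1 / 0
    except ZeroDivisionError as zde:
        raise ValueError('wrapping the original error') from zde
except ValueError as err:
    print(type(innermost(err)).__name__)  # -> ZeroDivisionError
```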
crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2

@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
 from bs4 import BeautifulSoup, Tag

 from crawlee._utils.docs import docs_group
-from crawlee.crawlers import AbstractHttpCrawler,
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions

 from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
 from ._beautifulsoup_parser import BeautifulSoupParser, BeautifulSoupParserType
@@ -58,7 +58,7 @@ class BeautifulSoupCrawler(AbstractHttpCrawler[BeautifulSoupCrawlingContext, Bea
         self,
         *,
         parser: BeautifulSoupParserType = 'lxml',
-        **kwargs: Unpack[
+        **kwargs: Unpack[HttpCrawlerOptions[BeautifulSoupCrawlingContext]],
     ) -> None:
         """Initialize a new instance.

crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2

@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
 from parsel import Selector

 from crawlee._utils.docs import docs_group
-from crawlee.crawlers import AbstractHttpCrawler,
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions

 from ._parsel_crawling_context import ParselCrawlingContext
 from ._parsel_parser import ParselParser
@@ -56,7 +56,7 @@ class ParselCrawler(AbstractHttpCrawler[ParselCrawlingContext, Selector, Selecto

     def __init__(
         self,
-        **kwargs: Unpack[
+        **kwargs: Unpack[HttpCrawlerOptions[ParselCrawlingContext]],
     ) -> None:
         """Initialize a new instance.

crawlee/crawlers/_parsel/_parsel_parser.py +1 -1

@@ -22,7 +22,7 @@ class ParselParser(AbstractHttpParser[Selector, Selector]):
     @override
     async def parse(self, response: HttpResponse) -> Selector:
         response_body = await response.read()
-        return await asyncio.to_thread(
+        return await asyncio.to_thread(Selector, body=response_body)

     @override
     async def parse_text(self, text: str) -> Selector: