crawlee 1.1.1b1__py3-none-any.whl → 1.2.1b7__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in the public registries.
Potentially problematic release: this version of crawlee might be problematic.
- crawlee/__init__.py +2 -1
- crawlee/_request.py +29 -10
- crawlee/_types.py +42 -2
- crawlee/_utils/context.py +2 -2
- crawlee/_utils/file.py +7 -0
- crawlee/_utils/recurring_task.py +2 -1
- crawlee/_utils/time.py +41 -1
- crawlee/crawlers/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +52 -14
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +10 -33
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
- crawlee/crawlers/_basic/_basic_crawler.py +135 -118
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +23 -4
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_playwright/_playwright_crawler.py +58 -17
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +1 -3
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +12 -0
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/router.py +13 -3
- crawlee/storage_clients/_file_system/_dataset_client.py +2 -2
- crawlee/storage_clients/_file_system/_key_value_store_client.py +3 -3
- crawlee/storage_clients/_file_system/_request_queue_client.py +3 -3
- crawlee/storage_clients/_sql/_storage_client.py +0 -9
- {crawlee-1.1.1b1.dist-info → crawlee-1.2.1b7.dist-info}/METADATA +10 -16
- {crawlee-1.1.1b1.dist-info → crawlee-1.2.1b7.dist-info}/RECORD +37 -36
- {crawlee-1.1.1b1.dist-info → crawlee-1.2.1b7.dist-info}/WHEEL +1 -1
- {crawlee-1.1.1b1.dist-info → crawlee-1.2.1b7.dist-info}/entry_points.txt +0 -0
- {crawlee-1.1.1b1.dist-info → crawlee-1.2.1b7.dist-info}/licenses/LICENSE +0 -0
crawlee/crawlers/_basic/_basic_crawler.py

@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import asyncio
+import functools
 import logging
 import signal
 import sys
@@ -13,8 +14,9 @@ from collections.abc import AsyncGenerator, Awaitable, Callable, Iterable, Seque
 from contextlib import AsyncExitStack, suppress
 from datetime import timedelta
 from functools import partial
+from io import StringIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generic, Literal, cast
+from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast
 from urllib.parse import ParseResult, urlparse
 from weakref import WeakKeyDictionary
 
@@ -31,6 +33,8 @@ from crawlee._service_locator import ServiceLocator
 from crawlee._types import (
     BasicCrawlingContext,
     EnqueueLinksKwargs,
+    ExportDataCsvKwargs,
+    ExportDataJsonKwargs,
     GetKeyValueStoreFromRequestHandlerFunction,
     HttpHeaders,
     HttpPayload,
@@ -40,7 +44,7 @@ from crawlee._types import (
     SkippedReason,
 )
 from crawlee._utils.docs import docs_group
-from crawlee._utils.file import export_csv_to_stream, export_json_to_stream
+from crawlee._utils.file import atomic_write, export_csv_to_stream, export_json_to_stream
 from crawlee._utils.recurring_task import RecurringTask
 from crawlee._utils.robots import RobotsTxtFile
 from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
@@ -55,6 +59,7 @@ from crawlee.errors import (
     RequestHandlerError,
     SessionError,
     UserDefinedErrorHandlerError,
+    UserHandlerTimeoutError,
 )
 from crawlee.events._types import Event, EventCrawlerStatusData
 from crawlee.http_clients import ImpitHttpClient
@@ -64,6 +69,7 @@ from crawlee.statistics import Statistics, StatisticsState
 from crawlee.storages import Dataset, KeyValueStore, RequestQueue
 
 from ._context_pipeline import ContextPipeline
+from ._context_utils import swaped_context
 from ._logging_utils import (
     get_one_line_error_summary_if_possible,
     reduce_asyncio_timeout_error_to_relevant_traceback_parts,
@@ -96,6 +102,9 @@ if TYPE_CHECKING:
 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 TRequestIterator = TypeVar('TRequestIterator', str, Request)
+TParams = ParamSpec('TParams')
+T = TypeVar('T')
+
 ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Request | None]]
 FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]]
 SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]
@@ -520,6 +529,24 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._logger.info(f'Crawler.stop() was called with following reason: {reason}.')
         self._unexpected_stop = True
 
+    def _wrap_handler_with_error_context(
+        self, handler: Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]
+    ) -> Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]:
+        """Decorate error handlers to make their context helpers usable."""
+
+        @functools.wraps(handler)
+        async def wrapped_handler(context: TCrawlingContext | BasicCrawlingContext, exception: Exception) -> T:
+            # Original context helpers that are from `RequestHandlerRunResult` will not be commited as the request
+            # failed. Modified context provides context helpers with direct access to the storages.
+            error_context = context.create_modified_copy(
+                push_data=self._push_data,
+                get_key_value_store=self.get_key_value_store,
+                add_requests=functools.partial(self._add_requests, context),
+            )
+            return await handler(error_context, exception)
+
+        return wrapped_handler
+
     def _stop_if_max_requests_count_exceeded(self) -> None:
         """Call `stop` when the maximum number of requests to crawl has been reached."""
         if self._max_requests_per_crawl is None:
@@ -618,7 +645,7 @@
 
         The error handler is invoked after a request handler error occurs and before a retry attempt.
         """
-        self._error_handler = handler
+        self._error_handler = self._wrap_handler_with_error_context(handler)
         return handler
 
     def failed_request_handler(
@@ -628,7 +655,7 @@
 
         The failed request handler is invoked when a request has failed all retry attempts.
         """
-        self._failed_request_handler = handler
+        self._failed_request_handler = self._wrap_handler_with_error_context(handler)
         return handler
 
     def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequestCallback:
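Because `error_handler` and `failed_request_handler` registrations are now wrapped by `_wrap_handler_with_error_context`, the context helpers inside those handlers write straight to the storages instead of going through the (never committed) handler result. A minimal sketch of a handler that relies on this; the import path and option names follow the usual crawlee public API rather than anything shown in this diff, and the URL and payload are illustrative only.

import asyncio

from crawlee.crawlers import BasicCrawler, BasicCrawlingContext


async def main() -> None:
    crawler = BasicCrawler(max_request_retries=2)

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        raise RuntimeError('simulated handler failure')

    @crawler.failed_request_handler
    async def failed(context: BasicCrawlingContext, error: Exception) -> None:
        # With the wrapping above, push_data() writes directly to the dataset,
        # because the failed request's own handler result is never committed.
        await context.push_data({'url': context.request.url, 'error': str(error)})

    await crawler.run(['https://example.com'])


if __name__ == '__main__':
    asyncio.run(main())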
@@ -846,6 +873,7 @@
         dataset_id: str | None = None,
         dataset_name: str | None = None,
         dataset_alias: str | None = None,
+        **additional_kwargs: Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs],  # type: ignore[misc]
     ) -> None:
         """Export all items from a Dataset to a JSON or CSV file.
 
@@ -858,6 +886,7 @@
             dataset_id: The ID of the Dataset to export from.
             dataset_name: The name of the Dataset to export from (global scope, named storage).
             dataset_alias: The alias of the Dataset to export from (run scope, unnamed storage).
+            additional_kwargs: Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format.
         """
         dataset = await Dataset.open(
             id=dataset_id,
@@ -867,13 +896,18 @@
             configuration=self._service_locator.get_configuration(),
         )
 
-        path =
-        dst = path.open('w', newline='')
+        path = Path(path)
 
         if path.suffix == '.csv':
-
+            dst = StringIO()
+            csv_kwargs = cast('ExportDataCsvKwargs', additional_kwargs)
+            await export_csv_to_stream(dataset.iterate_items(), dst, **csv_kwargs)
+            await atomic_write(path, dst.getvalue())
         elif path.suffix == '.json':
-
+            dst = StringIO()
+            json_kwargs = cast('ExportDataJsonKwargs', additional_kwargs)
+            await export_json_to_stream(dataset.iterate_items(), dst, **json_kwargs)
+            await atomic_write(path, dst.getvalue())
         else:
            raise ValueError(f'Unsupported file extension: {path.suffix}')
 
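`export_data()` now renders the whole export into an in-memory buffer and hands it to `atomic_write`, with any extra keyword arguments forwarded to the CSV or JSON exporter. A hedged usage sketch; the exact fields of `ExportDataCsvKwargs`/`ExportDataJsonKwargs` are not visible in this diff, so the `indent` and `delimiter` options below are assumptions that they mirror the standard json.dump and csv.writer parameters.

# Hypothetical calls against the extended signature; the kwarg names are assumptions.
await crawler.export_data('results.json', indent=2)
await crawler.export_data('results.csv', delimiter=';')

For readers unfamiliar with the atomic-write idea the new code relies on, here is a standalone sketch (not crawlee's `atomic_write` implementation): write to a sibling temporary file, then rename it over the target, so a crash never leaves a half-written export behind.

import os
import tempfile
from pathlib import Path


def atomic_write_sketch(path: Path, data: str) -> None:
    fd, tmp_name = tempfile.mkstemp(dir=path.parent, prefix=path.name, suffix='.tmp')
    try:
        with os.fdopen(fd, 'w') as tmp:
            tmp.write(data)
        os.replace(tmp_name, path)  # rename is atomic on the same filesystem
    except BaseException:
        os.unlink(tmp_name)
        raise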
@@ -1005,7 +1039,12 @@
         warning_flag = True
 
         for request in request_iterator:
-
+            if isinstance(request, Request):
+                if request.enqueue_strategy != strategy:
+                    request.enqueue_strategy = strategy
+                target_url = request.url
+            else:
+                target_url = request
             parsed_target_url = urlparse(target_url)
 
             if warning_flag and strategy != 'all' and not parsed_target_url.hostname:
@@ -1043,8 +1082,8 @@
             return target_url.hostname == origin_url.hostname
 
         if strategy == 'same-domain':
-            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).
-            target_domain = self._tld_extractor.extract_str(target_url.hostname).
+            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).top_domain_under_public_suffix
+            target_domain = self._tld_extractor.extract_str(target_url.hostname).top_domain_under_public_suffix
             return origin_domain == target_domain
 
         if strategy == 'same-origin':
@@ -1102,7 +1141,7 @@
             request.retry_count += 1
             reduced_error = str(error).split('\n')[0]
             self.log.warning(
-                f'Retrying request to {context.request.url} due to: {reduced_error}'
+                f'Retrying request to {context.request.url} due to: {reduced_error}. '
                f'{get_one_line_error_summary_if_possible(error)}'
             )
             await self._statistics.error_tracker.add(error=error, context=context)
@@ -1113,19 +1152,15 @@
                 except Exception as e:
                     raise UserDefinedErrorHandlerError('Exception thrown in user-defined request error handler') from e
                 else:
-                    if new_request is not None:
-
+                    if new_request is not None and new_request != request:
+                        await request_manager.add_request(new_request)
+                        await self._mark_request_as_handled(request)
+                        return
 
             await request_manager.reclaim_request(request)
         else:
-
-
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            request.state = RequestState.ERROR
+            await self._mark_request_as_handled(request)
             await self._handle_failed_request(context, error)
             self._statistics.record_request_processing_failure(request.unique_key)
 
@@ -1140,8 +1175,6 @@
                 f'{self._internal_timeout.total_seconds()} seconds',
                 logger=self._logger,
             )
-
-            context.request.state = RequestState.DONE
         except UserDefinedErrorHandlerError:
             context.request.state = RequestState.ERROR
             raise
@@ -1174,17 +1207,8 @@
         self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False
     ) -> None:
         if need_mark and isinstance(request, Request):
-            request_manager = await self.get_request_manager()
-
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
             request.state = RequestState.SKIPPED
+            await self._mark_request_as_handled(request)
 
         url = request.url if isinstance(request, Request) else request
 
@@ -1204,10 +1228,11 @@
 
         if (
             isinstance(error, asyncio.exceptions.TimeoutError)
+            and traceback_parts
             and self._request_handler_timeout_text in traceback_parts[-1]
-        ):
+        ) or isinstance(error, UserHandlerTimeoutError):
             used_traceback_parts = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
-            used_traceback_parts.
+            used_traceback_parts.extend(traceback_parts[-1:])
 
         return ''.join(used_traceback_parts).strip('\n')
 
@@ -1256,58 +1281,54 @@
         else:
             yield Request.from_url(url)
 
-    async def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                storage_client=self._service_locator.get_storage_client(),
-                configuration=self._service_locator.get_configuration(),
-            )
-        else:
-            request_manager = base_request_manager
-
-        requests = list[Request]()
-
-        base_url = url if (url := add_requests_call.get('base_url')) else origin
-
-        requests_iterator = self._convert_url_to_request_iterator(add_requests_call['requests'], base_url)
+    async def _add_requests(
+        self,
+        context: BasicCrawlingContext,
+        requests: Sequence[str | Request],
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
+        **kwargs: Unpack[EnqueueLinksKwargs],
+    ) -> None:
+        """Add requests method aware of the crawling context."""
+        if rq_id or rq_name or rq_alias:
+            request_manager: RequestManager = await RequestQueue.open(
+                id=rq_id,
+                name=rq_name,
+                alias=rq_alias,
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )
+        else:
+            request_manager = await self.get_request_manager()
 
-
+        context_aware_requests = list[Request]()
+        base_url = kwargs.get('base_url') or context.request.loaded_url or context.request.url
+        requests_iterator = self._convert_url_to_request_iterator(requests, base_url)
+        filter_requests_iterator = self._enqueue_links_filter_iterator(requests_iterator, context.request.url, **kwargs)
+        for dst_request in filter_requests_iterator:
+            # Update the crawl depth of the request.
+            dst_request.crawl_depth = context.request.crawl_depth + 1
 
-
-
-            )
+            if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:
+                context_aware_requests.append(dst_request)
 
-
-        # Update the crawl depth of the request.
-        dst_request.crawl_depth = context.request.crawl_depth + 1
+        return await request_manager.add_requests(context_aware_requests)
 
-
-
+    async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
+        """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
+        result = self._context_result_map[context]
 
-
+        for add_requests_call in result.add_requests_calls:
+            await self._add_requests(context, **add_requests_call)
 
         for push_data_call in result.push_data_calls:
             await self._push_data(**push_data_call)
 
         await self._commit_key_value_store_changes(result, get_kvs=self.get_key_value_store)
 
+        result.apply_request_changes(target=context.request)
+
     @staticmethod
     async def _commit_key_value_store_changes(
         result: RequestHandlerRunResult, get_kvs: GetKeyValueStoreFromRequestHandlerFunction
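The new `_add_requests()` helper backs the context's `add_requests` function, both for live handler calls and for the replayed `add_requests_calls` in `_commit_request_handler_result`, and it can route requests into a different request queue via `rq_id`/`rq_name`/`rq_alias`. A hedged sketch of what that looks like from a request handler; treat the availability of `rq_name` on `context.add_requests` as an inference from this diff, and the URLs and queue name as made up.

from crawlee.crawlers import BasicCrawler, BasicCrawlingContext

crawler = BasicCrawler()


@crawler.router.default_handler
async def default_handler(context: BasicCrawlingContext) -> None:
    # Discovered links are filtered, depth-limited and enqueued by _add_requests();
    # rq_name mirrors its rq_id/rq_name/rq_alias parameters (assumed, per this diff).
    await context.add_requests(
        ['https://example.com/category/1', 'https://example.com/category/2'],
        rq_name='categories',
    )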
@@ -1373,10 +1394,10 @@
         else:
             session = await self._get_session()
             proxy_info = await self._get_proxy_info(request, session)
-        result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store)
+        result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store, request=request)
 
         context = BasicCrawlingContext(
-            request=request,
+            request=result.request,
             session=session,
             proxy_info=proxy_info,
             send_request=self._prepare_send_request_function(session, proxy_info),
@@ -1393,32 +1414,26 @@
         try:
             request.state = RequestState.REQUEST_HANDLER
 
-            self._check_request_collision(context.request, context.session)
-
             try:
-
+                with swaped_context(context, request):
+                    self._check_request_collision(request, session)
+                    await self._run_request_handler(context=context)
             except asyncio.TimeoutError as e:
                 raise RequestHandlerError(e, context) from e
 
             await self._commit_request_handler_result(context)
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
 
             request.state = RequestState.DONE
 
-
-
+            await self._mark_request_as_handled(request)
+
+            if session and session.is_usable:
+                session.mark_good()
 
             self._statistics.record_request_processing_finish(request.unique_key)
 
         except RequestCollisionError as request_error:
-
+            request.no_retry = True
             await self._handle_request_error(context, request_error)
 
         except RequestHandlerError as primary_error:
@@ -1433,7 +1448,7 @@
             await self._handle_request_error(primary_error.crawling_context, primary_error.wrapped_exception)
 
         except SessionError as session_error:
-            if not
+            if not session:
                 raise RuntimeError('SessionError raised in a crawling context without a session') from session_error
 
             if self._error_handler:
@@ -1443,22 +1458,16 @@
                 exc_only = ''.join(traceback.format_exception_only(session_error)).strip()
                 self._logger.warning('Encountered "%s", rotating session and retrying...', exc_only)
 
-
+                if session:
+                    session.retire()
 
                 # Increment session rotation count.
-
+                request.session_rotation_count = (request.session_rotation_count or 0) + 1
 
                 await request_manager.reclaim_request(request)
                 await self._statistics.error_tracker_retry.add(error=session_error, context=context)
             else:
-                await
-                    lambda: request_manager.mark_request_as_handled(context.request),
-                    timeout=self._internal_timeout,
-                    timeout_message='Marking request as handled timed out after '
-                    f'{self._internal_timeout.total_seconds()} seconds',
-                    logger=self._logger,
-                    max_retries=3,
-                )
+                await self._mark_request_as_handled(request)
 
                 await self._handle_failed_request(context, session_error)
                 self._statistics.record_request_processing_failure(request.unique_key)
@@ -1466,14 +1475,7 @@
         except ContextPipelineInterruptedError as interrupted_error:
             self._logger.debug('The context pipeline was interrupted', exc_info=interrupted_error)
 
-            await
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            await self._mark_request_as_handled(request)
 
         except ContextPipelineInitializationError as initialization_error:
             self._logger.debug(
@@ -1491,12 +1493,16 @@
             raise
 
     async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
-
-
-
-
-
-
+        context.request.state = RequestState.BEFORE_NAV
+        await self._context_pipeline(
+            context,
+            lambda final_context: wait_for(
+                lambda: self.router(final_context),
+                timeout=self._request_handler_timeout,
+                timeout_message=f'{self._request_handler_timeout_text}'
+                f' {self._request_handler_timeout.total_seconds()} seconds',
+                logger=self._logger,
+            ),
         )
 
     def _raise_for_error_status_code(self, status_code: int) -> None:
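`_run_request_handler` now routes the final context through the router inside `wait_for`, so `request_handler_timeout` bounds exactly the user handler call. A hedged sketch of the user-facing side; `request_handler_timeout` and the router decorator are standard crawlee options not shown in this diff, and the values are illustrative.

from datetime import timedelta

from crawlee.crawlers import BasicCrawler, BasicCrawlingContext

# The handler registered on the router is what runs under request_handler_timeout;
# exceeding it is reported through the timeout / UserHandlerTimeoutError paths above.
crawler = BasicCrawler(request_handler_timeout=timedelta(seconds=30))


@crawler.router.default_handler
async def handler(context: BasicCrawlingContext) -> None:
    await context.push_data({'url': context.request.url})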
@@ -1644,3 +1650,14 @@
             )
 
         self._previous_crawler_state = current_state
+
+    async def _mark_request_as_handled(self, request: Request) -> None:
+        request_manager = await self.get_request_manager()
+        await wait_for(
+            lambda: request_manager.mark_request_as_handled(request),
+            timeout=self._internal_timeout,
+            timeout_message='Marking request as handled timed out after '
+            f'{self._internal_timeout.total_seconds()} seconds',
+            logger=self._logger,
+            max_retries=3,
+        )
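The repeated `wait_for(lambda: request_manager.mark_request_as_handled(...), ...)` blocks that the earlier hunks delete are now funnelled through this single helper. For readers unfamiliar with the pattern, here is a self-contained sketch of a retry-with-timeout wrapper with the same shape of arguments; it is an illustration, not crawlee's `wait_for` implementation.

import asyncio
import logging
from collections.abc import Awaitable, Callable
from datetime import timedelta
from typing import TypeVar

T = TypeVar('T')


async def wait_for_sketch(
    operation: Callable[[], Awaitable[T]],
    *,
    timeout: timedelta,
    timeout_message: str,
    logger: logging.Logger,
    max_retries: int = 1,
) -> T:
    """Run `operation` under a timeout, retrying a limited number of times before giving up."""
    for attempt in range(1, max_retries + 1):
        try:
            return await asyncio.wait_for(operation(), timeout=timeout.total_seconds())
        except asyncio.TimeoutError:
            if attempt == max_retries:
                raise asyncio.TimeoutError(timeout_message)
            logger.warning('%s (attempt %d/%d), retrying...', timeout_message, attempt, max_retries)
    raise RuntimeError('unreachable')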
crawlee/crawlers/_basic/_context_utils.py (new file)

@@ -0,0 +1,24 @@
+from __future__ import annotations
+
+from contextlib import contextmanager
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+
+    from crawlee._request import Request
+
+    from ._basic_crawling_context import BasicCrawlingContext
+
+
+@contextmanager
+def swaped_context(
+    context: BasicCrawlingContext,
+    request: Request,
+) -> Iterator[None]:
+    """Replace context's isolated copies with originals after handler execution."""
+    try:
+        yield
+    finally:
+        # Restore original context state to avoid side effects between different handlers.
+        object.__setattr__(context, 'request', request)
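The context manager yields immediately and only does work in `finally`: once the handler finishes (or raises), the original request object is put back onto the frozen context, which is why `object.__setattr__` is used. A standalone, hedged demo of the same swap-and-restore pattern on a frozen dataclass; the `Box` class and field names are invented for illustration.

from collections.abc import Iterator
from contextlib import contextmanager
from dataclasses import dataclass


@dataclass(frozen=True)
class Box:
    value: int


@contextmanager
def swapped_value(box: Box, temporary: int) -> Iterator[None]:
    original = box.value
    object.__setattr__(box, 'value', temporary)  # bypass the frozen-dataclass guard
    try:
        yield
    finally:
        object.__setattr__(box, 'value', original)  # always restore, even on error


box = Box(value=1)
with swapped_value(box, temporary=2):
    assert box.value == 2
assert box.value == 1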
crawlee/crawlers/_basic/_logging_utils.py

@@ -2,9 +2,21 @@ import asyncio
 import re
 import traceback
 
+import crawlee.errors
+
 
 def _get_only_innermost_exception(error: BaseException) -> BaseException:
-    """Get innermost exception by following __cause__ and __context__ attributes of exception.
+    """Get innermost exception by following __cause__ and __context__ attributes of exception.
+
+    If the innermost exception is UserHandlerTimeoutError, return whatever caused that if possible.
+    """
+    if type(error) is crawlee.errors.UserHandlerTimeoutError:
+        if error.__cause__:
+            return error.__cause__
+        if error.__context__:
+            return error.__context__
+        return error
+
     if error.__cause__:
         return _get_only_innermost_exception(error.__cause__)
     if error.__context__:
@@ -34,7 +46,7 @@ def _strip_pep657_highlighting(traceback_part: str) -> str:
 
 
 def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
-    timeout_error: asyncio.exceptions.TimeoutError,
+    timeout_error: asyncio.exceptions.TimeoutError | crawlee.errors.UserHandlerTimeoutError,
 ) -> list[str]:
     innermost_error_traceback_parts = _get_traceback_parts_for_innermost_exception(timeout_error)
     return _get_filtered_traceback_parts_for_asyncio_timeout_error(innermost_error_traceback_parts)
@@ -43,13 +55,20 @@ def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
 def _get_traceback_parts_for_innermost_exception(error: Exception) -> list[str]:
     innermost_error = _get_only_innermost_exception(error)
     return traceback.format_exception(
-        type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=
+        type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=False
     )
 
 
 def get_one_line_error_summary_if_possible(error: Exception) -> str:
     if isinstance(error, asyncio.exceptions.TimeoutError):
-
+        relevant_part = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
+        most_relevant_part = (',' + relevant_part[-1]) if len(relevant_part) else ''
+    elif isinstance(error, crawlee.errors.UserHandlerTimeoutError):
+        # Error is user defined handler. First two lines should be location of the `UserHandlerTimeoutError` in crawlee
+        # code and third line the topmost user error
+        traceback_parts = _get_traceback_parts_for_innermost_exception(error)
+        relevant_index_from_start = 3
+        most_relevant_part = traceback_parts[2] if len(traceback_parts) >= relevant_index_from_start else ''
     elif 'playwright._impl._errors.Error' in str(error.__class__):
         # Playwright autogenerated errors are often very long, so we do not try to summarize them at all as they anyway
         # point to deep internals.
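`_get_only_innermost_exception` follows the standard Python exception-chain attributes: `__cause__` is set by `raise ... from ...`, `__context__` by an exception raised while another is being handled. A small hedged demo of the same chain walking (without the `UserHandlerTimeoutError` special case), showing which exception the summary helpers end up pointing at.

def innermost(error: BaseException) -> BaseException:
    # Same idea as _get_only_innermost_exception, minus the UserHandlerTimeoutError branch.
    if error.__cause__:
        return innermost(error.__cause__)
    if error.__context__:
        return innermost(error.__context__)
    return error


try:
    try:
        1 / 0
    except ZeroDivisionError as inner:
        raise TimeoutError('handler timed out') from inner  # sets __cause__ on the new error
except TimeoutError as outer:
    assert isinstance(innermost(outer), ZeroDivisionError)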
crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py

@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
 from bs4 import BeautifulSoup, Tag
 
 from crawlee._utils.docs import docs_group
-from crawlee.crawlers import AbstractHttpCrawler,
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions
 
 from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
 from ._beautifulsoup_parser import BeautifulSoupParser, BeautifulSoupParserType
@@ -58,7 +58,7 @@ class BeautifulSoupCrawler(AbstractHttpCrawler[BeautifulSoupCrawlingContext, Bea
         self,
         *,
         parser: BeautifulSoupParserType = 'lxml',
-        **kwargs: Unpack[
+        **kwargs: Unpack[HttpCrawlerOptions[BeautifulSoupCrawlingContext]],
     ) -> None:
         """Initialize a new instance.
 
crawlee/crawlers/_parsel/_parsel_crawler.py

@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
 from parsel import Selector
 
 from crawlee._utils.docs import docs_group
-from crawlee.crawlers import AbstractHttpCrawler,
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions
 
 from ._parsel_crawling_context import ParselCrawlingContext
 from ._parsel_parser import ParselParser
@@ -56,7 +56,7 @@ class ParselCrawler(AbstractHttpCrawler[ParselCrawlingContext, Selector, Selecto
 
     def __init__(
         self,
-        **kwargs: Unpack[
+        **kwargs: Unpack[HttpCrawlerOptions[ParselCrawlingContext]],
     ) -> None:
         """Initialize a new instance.
 
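With `**kwargs` typed as `Unpack[HttpCrawlerOptions[...]]`, the options passed to the concrete crawlers are now checked statically instead of being an untyped pass-through. A hedged instantiation sketch; `max_requests_per_crawl` and `request_handler_timeout` are common crawlee crawler options and the URL/selector are illustrative, while the exact contents of `HttpCrawlerOptions` are not shown in this diff.

import asyncio
from datetime import timedelta

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


async def main() -> None:
    # These keyword options flow through **kwargs: Unpack[HttpCrawlerOptions[ParselCrawlingContext]],
    # so a type checker can flag typos or wrongly typed values at the call site.
    crawler = ParselCrawler(
        max_requests_per_crawl=10,
        request_handler_timeout=timedelta(seconds=30),
    )

    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        await context.push_data({'title': context.selector.css('title::text').get()})

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())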