crawlee 1.0.1b9__py3-none-any.whl → 1.3.1b3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlee might be problematic.
- crawlee/__init__.py +2 -1
- crawlee/_browserforge_workaround.py +7 -3
- crawlee/_request.py +62 -32
- crawlee/_service_locator.py +4 -4
- crawlee/_types.py +52 -19
- crawlee/_utils/context.py +3 -3
- crawlee/_utils/file.py +8 -1
- crawlee/_utils/globs.py +4 -4
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +27 -3
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +13 -6
- crawlee/_utils/system.py +27 -11
- crawlee/_utils/time.py +41 -1
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +5 -2
- crawlee/browsers/_playwright_browser.py +2 -1
- crawlee/browsers/_playwright_browser_controller.py +1 -1
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +3 -1
- crawlee/crawlers/__init__.py +5 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +54 -16
- crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +21 -30
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
- crawlee/crawlers/_basic/_basic_crawler.py +160 -134
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +23 -4
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +68 -23
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +12 -6
- crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +68 -14
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/otel/crawler_instrumentor.py +4 -6
- crawlee/request_loaders/_sitemap_request_loader.py +23 -5
- crawlee/router.py +13 -3
- crawlee/sessions/_cookies.py +13 -8
- crawlee/sessions/_models.py +3 -3
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +51 -9
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +4 -0
- crawlee/storage_clients/_base/_dataset_client.py +2 -2
- crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_file_system/_dataset_client.py +8 -7
- crawlee/storage_clients/_file_system/_key_value_store_client.py +9 -6
- crawlee/storage_clients/_file_system/_request_queue_client.py +31 -12
- crawlee/storage_clients/_memory/_dataset_client.py +2 -2
- crawlee/storage_clients/_memory/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_memory/_request_queue_client.py +2 -2
- crawlee/storage_clients/_redis/__init__.py +6 -0 (new Redis storage backend; usage sketch after this list)
- crawlee/storage_clients/_redis/_client_mixin.py +292 -0
- crawlee/storage_clients/_redis/_dataset_client.py +329 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
- crawlee/storage_clients/_redis/_storage_client.py +149 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/_client_mixin.py +1 -1
- crawlee/storage_clients/_sql/_dataset_client.py +2 -2
- crawlee/storage_clients/_sql/_db_models.py +1 -2
- crawlee/storage_clients/_sql/_key_value_store_client.py +5 -4
- crawlee/storage_clients/_sql/_request_queue_client.py +20 -6
- crawlee/storage_clients/_sql/_storage_client.py +1 -1
- crawlee/storage_clients/models.py +8 -3
- crawlee/storages/_base.py +3 -1
- crawlee/storages/_dataset.py +3 -0
- crawlee/storages/_key_value_store.py +8 -2
- crawlee/storages/_request_queue.py +3 -0
- crawlee/storages/_storage_instance_manager.py +109 -42
- crawlee/storages/_utils.py +11 -0
- {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/METADATA +14 -16
- {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/RECORD +93 -79
- {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/WHEEL +1 -1
- {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/entry_points.txt +0 -0
- {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/licenses/LICENSE +0 -0
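
The largest structural addition in the list above is a new Redis storage backend under crawlee/storage_clients/_redis/ (a client mixin, dataset, key-value store and request queue clients, plus Lua scripts for atomic queue operations). A minimal usage sketch follows; the class name RedisStorageClient and its connection_string parameter are assumptions inferred from the module names and are not shown in this diff, while RequestQueue.open(..., storage_client=...) matches the call that appears later in the _basic_crawler.py hunks.

    # Hypothetical sketch: plugging the new Redis backend into a request queue.
    # `RedisStorageClient` and `connection_string` are assumptions, not confirmed by this diff.
    from crawlee.storage_clients import RedisStorageClient
    from crawlee.storages import RequestQueue


    async def open_redis_queue() -> RequestQueue:
        storage_client = RedisStorageClient(connection_string='redis://localhost:6379/0')
        # The alias/storage_client keyword arguments mirror the RequestQueue.open()
        # call used in the `_add_requests` hunk further down.
        return await RequestQueue.open(alias='redis-queue', storage_client=storage_client)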
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py

@@ -17,7 +17,7 @@ if TYPE_CHECKING:
     from playwright.async_api import Page, Response
     from typing_extensions import Self

-    from crawlee.crawlers._playwright._types import BlockRequestsFunction
+    from crawlee.crawlers._playwright._types import BlockRequestsFunction, GotoOptions


 TStaticParseResult = TypeVar('TStaticParseResult')

@@ -190,8 +190,9 @@ class AdaptivePlaywrightCrawlingContext(
         http_response = await PlaywrightHttpResponse.from_playwright_response(
             response=context.response, protocol=protocol_guess or ''
         )
-        # block_requests
+        # block_requests and goto_options are useful only on pre-navigation contexts. It is useless here.
         context_kwargs.pop('block_requests')
+        context_kwargs.pop('goto_options')
         return cls(
             parsed_content=await parser.parse(http_response),
             http_response=http_response,

@@ -212,6 +213,9 @@ class AdaptivePlaywrightPreNavCrawlingContext(BasicCrawlingContext):
     block_requests: BlockRequestsFunction | None = None
     """Blocks network requests matching specified URL patterns."""

+    goto_options: GotoOptions | None = None
+    """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""
+
     @property
     def page(self) -> Page:
         """The Playwright `Page` object for the current page.
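The new goto_options field is the user-facing piece of this change: a pre-navigation hook can use it to tune Playwright's Page.goto() call. A hedged sketch follows; it assumes both classes are exported from crawlee.crawlers, that the adaptive crawler exposes a pre_navigation_hook decorator, and that GotoOptions mirrors Page.goto() keyword arguments such as wait_until. Whether a hook mutates a pre-populated dict or assigns a fresh one is also an assumption, since only the field declaration appears in the diff.

    # Hedged sketch -- exported names, the hook decorator, and the GotoOptions keys
    # are assumptions; only the `goto_options` field itself appears in the diff.
    from crawlee.crawlers import (
        AdaptivePlaywrightCrawler,
        AdaptivePlaywrightPreNavCrawlingContext,
    )

    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser()


    @crawler.pre_navigation_hook
    async def tune_navigation(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
        # Forwarded to Page.goto(); per the docstring above, `timeout` is not supported.
        if context.goto_options is not None:
            context.goto_options['wait_until'] = 'networkidle'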
crawlee/crawlers/_basic/_basic_crawler.py

@@ -2,6 +2,7 @@
 from __future__ import annotations

 import asyncio
+import functools
 import logging
 import signal
 import sys

@@ -13,8 +14,9 @@ from collections.abc import AsyncGenerator, Awaitable, Callable, Iterable, Seque
 from contextlib import AsyncExitStack, suppress
 from datetime import timedelta
 from functools import partial
+from io import StringIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generic, Literal, cast
+from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast
 from urllib.parse import ParseResult, urlparse
 from weakref import WeakKeyDictionary


@@ -31,6 +33,8 @@ from crawlee._service_locator import ServiceLocator
 from crawlee._types import (
     BasicCrawlingContext,
     EnqueueLinksKwargs,
+    ExportDataCsvKwargs,
+    ExportDataJsonKwargs,
     GetKeyValueStoreFromRequestHandlerFunction,
     HttpHeaders,
     HttpPayload,

@@ -40,7 +44,7 @@ from crawlee._types import (
     SkippedReason,
 )
 from crawlee._utils.docs import docs_group
-from crawlee._utils.file import export_csv_to_stream, export_json_to_stream
+from crawlee._utils.file import atomic_write, export_csv_to_stream, export_json_to_stream
 from crawlee._utils.recurring_task import RecurringTask
 from crawlee._utils.robots import RobotsTxtFile
 from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute

@@ -55,6 +59,7 @@ from crawlee.errors import (
     RequestHandlerError,
     SessionError,
     UserDefinedErrorHandlerError,
+    UserHandlerTimeoutError,
 )
 from crawlee.events._types import Event, EventCrawlerStatusData
 from crawlee.http_clients import ImpitHttpClient

@@ -64,6 +69,7 @@ from crawlee.statistics import Statistics, StatisticsState
 from crawlee.storages import Dataset, KeyValueStore, RequestQueue

 from ._context_pipeline import ContextPipeline
+from ._context_utils import swapped_context
 from ._logging_utils import (
     get_one_line_error_summary_if_possible,
     reduce_asyncio_timeout_error_to_relevant_traceback_parts,

@@ -96,6 +102,9 @@ if TYPE_CHECKING:
 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 TRequestIterator = TypeVar('TRequestIterator', str, Request)
+TParams = ParamSpec('TParams')
+T = TypeVar('T')
+
 ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Request | None]]
 FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]]
 SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]

@@ -401,7 +410,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._context_result_map = WeakKeyDictionary[BasicCrawlingContext, RequestHandlerRunResult]()

         # Context pipeline
-        self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)
+        self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)  # ty: ignore[invalid-argument-type]

         # Crawl settings
         self._max_request_retries = max_request_retries

@@ -437,14 +446,23 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._statistics_log_format = statistics_log_format

         # Statistics
-        [8 removed lines not shown in this diff view]
+        if statistics:
+            self._statistics = statistics
+        else:
+
+            async def persist_state_factory() -> KeyValueStore:
+                return await self.get_key_value_store()
+
+            self._statistics = cast(
+                'Statistics[TStatisticsState]',
+                Statistics.with_default_state(
+                    persistence_enabled=True,
+                    periodic_message_logger=self._logger,
+                    statistics_log_format=self._statistics_log_format,
+                    log_message='Current request statistics:',
+                    persist_state_kvs_factory=persist_state_factory,
+                ),
+            )

         # Additional context managers to enter and exit
         self._additional_context_managers = _additional_context_managers or []
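When no statistics instance is supplied, the crawler now builds one itself via Statistics.with_default_state(), wiring persistence to its own key-value store. The sketch below supplies a custom instance instead, using only keyword arguments that appear in the hunk above and assuming the omitted ones have defaults.

    import logging

    from crawlee.crawlers import BasicCrawler
    from crawlee.statistics import Statistics

    # Only kwargs visible in the hunk are used; the rest are assumed to have defaults.
    statistics = Statistics.with_default_state(
        periodic_message_logger=logging.getLogger('my_crawler'),
        log_message='Current request statistics:',
    )
    crawler = BasicCrawler(statistics=statistics)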
@@ -511,6 +529,24 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._logger.info(f'Crawler.stop() was called with following reason: {reason}.')
         self._unexpected_stop = True

+    def _wrap_handler_with_error_context(
+        self, handler: Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]
+    ) -> Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]:
+        """Decorate error handlers to make their context helpers usable."""
+
+        @functools.wraps(handler)
+        async def wrapped_handler(context: TCrawlingContext | BasicCrawlingContext, exception: Exception) -> T:
+            # Original context helpers that are from `RequestHandlerRunResult` will not be committed as the request
+            # failed. Modified context provides context helpers with direct access to the storages.
+            error_context = context.create_modified_copy(
+                push_data=self._push_data,
+                get_key_value_store=self.get_key_value_store,
+                add_requests=functools.partial(self._add_requests, context),
+            )
+            return await handler(error_context, exception)
+
+        return wrapped_handler
+
     def _stop_if_max_requests_count_exceeded(self) -> None:
         """Call `stop` when the maximum number of requests to crawl has been reached."""
         if self._max_requests_per_crawl is None:

@@ -609,7 +645,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

         The error handler is invoked after a request handler error occurs and before a retry attempt.
         """
-        self._error_handler = handler
+        self._error_handler = self._wrap_handler_with_error_context(handler)
         return handler

     def failed_request_handler(

@@ -619,7 +655,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

         The failed request handler is invoked when a request has failed all retry attempts.
         """
-        self._failed_request_handler = handler
+        self._failed_request_handler = self._wrap_handler_with_error_context(handler)
         return handler

     def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequestCallback:
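Both decorators now pass the user handler through _wrap_handler_with_error_context, so context helpers called from an error or failed-request handler write straight to the storages instead of into a RequestHandlerRunResult that would be discarded for a failed request. A sketch follows; the import paths for the context class are as commonly used in crawlee's docs and should be treated as an assumption.

    from crawlee.crawlers import BasicCrawler, BasicCrawlingContext

    crawler = BasicCrawler()


    @crawler.failed_request_handler
    async def record_failure(context: BasicCrawlingContext, error: Exception) -> None:
        # push_data now talks to the dataset directly, even though the request failed.
        await context.push_data({'url': context.request.url, 'error': repr(error)})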
@@ -689,7 +725,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         except CancelledError:
             pass
         finally:
-            await self._crawler_state_rec_task.stop()
             if threading.current_thread() is threading.main_thread():
                 with suppress(NotImplementedError):
                     asyncio.get_running_loop().remove_signal_handler(signal.SIGINT)

@@ -721,8 +756,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
     async def _run_crawler(self) -> None:
         event_manager = self._service_locator.get_event_manager()

-        self._crawler_state_rec_task.start()
-
         # Collect the context managers to be entered. Context managers that are already active are excluded,
         # as they were likely entered by the caller, who will also be responsible for exiting them.
         contexts_to_enter = [

@@ -733,6 +766,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                 self._statistics,
                 self._session_pool if self._use_session_pool else None,
                 self._http_client,
+                self._crawler_state_rec_task,
                 *self._additional_context_managers,
             )
             if cm and getattr(cm, 'active', False) is False

@@ -740,7 +774,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

         async with AsyncExitStack() as exit_stack:
             for context in contexts_to_enter:
-                await exit_stack.enter_async_context(context)  #
+                await exit_stack.enter_async_context(context)  # ty: ignore[invalid-argument-type]

             await self._autoscaled_pool.run()


@@ -839,6 +873,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         dataset_id: str | None = None,
         dataset_name: str | None = None,
         dataset_alias: str | None = None,
+        **additional_kwargs: Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs],
     ) -> None:
         """Export all items from a Dataset to a JSON or CSV file.


@@ -851,6 +886,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             dataset_id: The ID of the Dataset to export from.
             dataset_name: The name of the Dataset to export from (global scope, named storage).
             dataset_alias: The alias of the Dataset to export from (run scope, unnamed storage).
+            additional_kwargs: Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format.
         """
         dataset = await Dataset.open(
             id=dataset_id,

@@ -860,13 +896,18 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             configuration=self._service_locator.get_configuration(),
         )

-        path =
-        dst = path.open('w', newline='')
+        path = Path(path)

         if path.suffix == '.csv':
-            [removed line not shown in this diff view]
+            dst = StringIO()
+            csv_kwargs = cast('ExportDataCsvKwargs', additional_kwargs)
+            await export_csv_to_stream(dataset.iterate_items(), dst, **csv_kwargs)
+            await atomic_write(path, dst.getvalue())
         elif path.suffix == '.json':
-            [removed line not shown in this diff view]
+            dst = StringIO()
+            json_kwargs = cast('ExportDataJsonKwargs', additional_kwargs)
+            await export_json_to_stream(dataset.iterate_items(), dst, **json_kwargs)
+            await atomic_write(path, dst.getvalue())
         else:
             raise ValueError(f'Unsupported file extension: {path.suffix}')

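export_data now renders into a StringIO buffer, forwards the format-specific keyword arguments, and writes the result with atomic_write. A sketch of the call site follows; the concrete keys of ExportDataCsvKwargs and ExportDataJsonKwargs are not shown in this diff, so delimiter and indent below are assumptions based on csv.writer and json.dump.

    from pathlib import Path

    from crawlee.crawlers import BasicCrawler


    async def export_results(crawler: BasicCrawler) -> None:
        # `delimiter` and `indent` are assumed exporter kwargs (csv.writer / json.dump style).
        await crawler.export_data(Path('results.csv'), delimiter=';')
        await crawler.export_data(Path('results.json'), indent=2)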
@@ -972,6 +1013,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                     label=label,
                     user_data=user_data,
                     transform_request_function=transform_request_function,
+                    **kwargs,
                 ),
                 rq_id=rq_id,
                 rq_name=rq_name,

@@ -997,7 +1039,12 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        warning_flag = True

        for request in request_iterator:
-            [removed line not shown in this diff view]
+            if isinstance(request, Request):
+                if request.enqueue_strategy != strategy:
+                    request.enqueue_strategy = strategy
+                target_url = request.url
+            else:
+                target_url = request
            parsed_target_url = urlparse(target_url)

            if warning_flag and strategy != 'all' and not parsed_target_url.hostname:

@@ -1009,9 +1056,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
            ) and self._check_url_patterns(target_url, kwargs.get('include'), kwargs.get('exclude')):
                yield request

-                [3 removed lines not shown in this diff view]
+                if limit is not None:
+                    limit -= 1
+                    if limit <= 0:
+                        break

    def _check_enqueue_strategy(
        self,
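The iterator now also pins each Request object's enqueue_strategy to the strategy passed in and stops yielding once the optional limit is exhausted. From a handler this surfaces through enqueue_links; a sketch follows, assuming strategy and limit are accepted keyword arguments of the context helper, as the kwargs handling above suggests.

    from crawlee.crawlers import ParselCrawler, ParselCrawlingContext

    crawler = ParselCrawler()


    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        # Follow at most 20 links that stay on the same registrable domain.
        await context.enqueue_links(strategy='same-domain', limit=20)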
@@ -1035,8 +1083,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
            return target_url.hostname == origin_url.hostname

        if strategy == 'same-domain':
-            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).
-            target_domain = self._tld_extractor.extract_str(target_url.hostname).
+            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).top_domain_under_public_suffix
+            target_domain = self._tld_extractor.extract_str(target_url.hostname).top_domain_under_public_suffix
            return origin_domain == target_domain

        if strategy == 'same-origin':
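The same-domain comparison now reads tldextract's top_domain_under_public_suffix (the attribute name the removed lines are truncated before in this view). A small illustration of what gets compared, assuming current tldextract behaviour where this attribute equals the older registered_domain:

    import tldextract

    # Both subdomains share the registrable domain 'example.co.uk', so the
    # 'same-domain' strategy treats them as the same site.
    a = tldextract.extract('https://blog.example.co.uk/post')
    b = tldextract.extract('https://shop.example.co.uk/')
    assert a.top_domain_under_public_suffix == b.top_domain_under_public_suffix == 'example.co.uk'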
@@ -1094,7 +1142,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
            request.retry_count += 1
            reduced_error = str(error).split('\n')[0]
            self.log.warning(
-                f'Retrying request to {context.request.url} due to: {reduced_error}'
+                f'Retrying request to {context.request.url} due to: {reduced_error}. '
                f'{get_one_line_error_summary_if_possible(error)}'
            )
            await self._statistics.error_tracker.add(error=error, context=context)

@@ -1105,19 +1153,15 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                except Exception as e:
                    raise UserDefinedErrorHandlerError('Exception thrown in user-defined request error handler') from e
                else:
-                    if new_request is not None:
-                        [removed line not shown in this diff view]
+                    if new_request is not None and new_request != request:
+                        await request_manager.add_request(new_request)
+                        await self._mark_request_as_handled(request)
+                        return

            await request_manager.reclaim_request(request)
        else:
-            [2 removed lines not shown in this diff view]
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            request.state = RequestState.ERROR
+            await self._mark_request_as_handled(request)
            await self._handle_failed_request(context, error)
            self._statistics.record_request_processing_failure(request.unique_key)

@@ -1132,8 +1176,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                f'{self._internal_timeout.total_seconds()} seconds',
                logger=self._logger,
            )
-
-            context.request.state = RequestState.DONE
        except UserDefinedErrorHandlerError:
            context.request.state = RequestState.ERROR
            raise

@@ -1166,17 +1208,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False
    ) -> None:
        if need_mark and isinstance(request, Request):
-            request_manager = await self.get_request_manager()
-
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
            request.state = RequestState.SKIPPED
+            await self._mark_request_as_handled(request)

        url = request.url if isinstance(request, Request) else request

@@ -1196,10 +1229,11 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

        if (
            isinstance(error, asyncio.exceptions.TimeoutError)
+            and traceback_parts
            and self._request_handler_timeout_text in traceback_parts[-1]
-        ):
+        ) or isinstance(error, UserHandlerTimeoutError):
            used_traceback_parts = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
-            used_traceback_parts.
+            used_traceback_parts.extend(traceback_parts[-1:])

        return ''.join(used_traceback_parts).strip('\n')

@@ -1248,58 +1282,54 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
            else:
                yield Request.from_url(url)

-    async def
-        [19 removed lines not shown in this diff view]
-                storage_client=self._service_locator.get_storage_client(),
-                configuration=self._service_locator.get_configuration(),
-            )
-        else:
-            request_manager = base_request_manager
-
-        requests = list[Request]()
-
-        base_url = url if (url := add_requests_call.get('base_url')) else origin
-
-        requests_iterator = self._convert_url_to_request_iterator(add_requests_call['requests'], base_url)
+    async def _add_requests(
+        self,
+        context: BasicCrawlingContext,
+        requests: Sequence[str | Request],
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
+        **kwargs: Unpack[EnqueueLinksKwargs],
+    ) -> None:
+        """Add requests method aware of the crawling context."""
+        if rq_id or rq_name or rq_alias:
+            request_manager: RequestManager = await RequestQueue.open(
+                id=rq_id,
+                name=rq_name,
+                alias=rq_alias,
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )
+        else:
+            request_manager = await self.get_request_manager()

-        [removed line not shown in this diff view]
+        context_aware_requests = list[Request]()
+        base_url = kwargs.get('base_url') or context.request.loaded_url or context.request.url
+        requests_iterator = self._convert_url_to_request_iterator(requests, base_url)
+        filter_requests_iterator = self._enqueue_links_filter_iterator(requests_iterator, context.request.url, **kwargs)
+        for dst_request in filter_requests_iterator:
+            # Update the crawl depth of the request.
+            dst_request.crawl_depth = context.request.crawl_depth + 1

-        [2 removed lines not shown in this diff view]
-            )
+            if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:
+                context_aware_requests.append(dst_request)

-        [removed line not shown in this diff view]
-            # Update the crawl depth of the request.
-            dst_request.crawl_depth = context.request.crawl_depth + 1
+        return await request_manager.add_requests(context_aware_requests)

-        [2 removed lines not shown in this diff view]
+    async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
+        """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
+        result = self._context_result_map[context]

-        [removed line not shown in this diff view]
+        for add_requests_call in result.add_requests_calls:
+            await self._add_requests(context, **add_requests_call)

        for push_data_call in result.push_data_calls:
            await self._push_data(**push_data_call)

        await self._commit_key_value_store_changes(result, get_kvs=self.get_key_value_store)

+        result.apply_request_changes(target=context.request)
+
    @staticmethod
    async def _commit_key_value_store_changes(
        result: RequestHandlerRunResult, get_kvs: GetKeyValueStoreFromRequestHandlerFunction
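The new _add_requests helper resolves an explicit target queue (by rq_id, rq_name, or rq_alias) and applies the same link filtering and crawl-depth bookkeeping as enqueue_links. From a request handler the same options are reachable through the add_requests context helper; a sketch follows, assuming the helper forwards rq_alias as the signature above suggests.

    from crawlee.crawlers import BasicCrawler, BasicCrawlingContext

    crawler = BasicCrawler()


    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        # Route follow-up requests into a separate, run-scoped queue.
        await context.add_requests(
            ['https://example.com/next'],
            rq_alias='secondary',
        )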
@@ -1365,10 +1395,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        else:
            session = await self._get_session()
            proxy_info = await self._get_proxy_info(request, session)
-        result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store)
+        result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store, request=request)

        context = BasicCrawlingContext(
-            request=request,
+            request=result.request,
            session=session,
            proxy_info=proxy_info,
            send_request=self._prepare_send_request_function(session, proxy_info),

@@ -1385,32 +1415,26 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        try:
            request.state = RequestState.REQUEST_HANDLER

-            self._check_request_collision(context.request, context.session)
-
            try:
-                [removed line not shown in this diff view]
+                with swapped_context(context, request):
+                    self._check_request_collision(request, session)
+                    await self._run_request_handler(context=context)
            except asyncio.TimeoutError as e:
                raise RequestHandlerError(e, context) from e

            await self._commit_request_handler_result(context)
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )

            request.state = RequestState.DONE

-            [2 removed lines not shown in this diff view]
+            await self._mark_request_as_handled(request)
+
+            if session and session.is_usable:
+                session.mark_good()

            self._statistics.record_request_processing_finish(request.unique_key)

        except RequestCollisionError as request_error:
-            [removed line not shown in this diff view]
+            request.no_retry = True
            await self._handle_request_error(context, request_error)

        except RequestHandlerError as primary_error:

@@ -1425,7 +1449,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
            await self._handle_request_error(primary_error.crawling_context, primary_error.wrapped_exception)

        except SessionError as session_error:
-            if not
+            if not session:
                raise RuntimeError('SessionError raised in a crawling context without a session') from session_error

            if self._error_handler:

@@ -1435,22 +1459,16 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
            exc_only = ''.join(traceback.format_exception_only(session_error)).strip()
            self._logger.warning('Encountered "%s", rotating session and retrying...', exc_only)

-            [removed line not shown in this diff view]
+            if session:
+                session.retire()

            # Increment session rotation count.
-            [removed line not shown in this diff view]
+            request.session_rotation_count = (request.session_rotation_count or 0) + 1

            await request_manager.reclaim_request(request)
            await self._statistics.error_tracker_retry.add(error=session_error, context=context)
        else:
-            await
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            await self._mark_request_as_handled(request)

            await self._handle_failed_request(context, session_error)
            self._statistics.record_request_processing_failure(request.unique_key)

@@ -1458,14 +1476,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        except ContextPipelineInterruptedError as interrupted_error:
            self._logger.debug('The context pipeline was interrupted', exc_info=interrupted_error)

-            await
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            await self._mark_request_as_handled(request)

        except ContextPipelineInitializationError as initialization_error:
            self._logger.debug(

@@ -1483,12 +1494,16 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
            raise

    async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
-        [6 removed lines not shown in this diff view]
+        context.request.state = RequestState.BEFORE_NAV
+        await self._context_pipeline(
+            context,
+            lambda final_context: wait_for(
+                lambda: self.router(final_context),
+                timeout=self._request_handler_timeout,
+                timeout_message=f'{self._request_handler_timeout_text}'
+                f' {self._request_handler_timeout.total_seconds()} seconds',
+                logger=self._logger,
+            ),
        )

    def _raise_for_error_status_code(self, status_code: int) -> None:
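The request handler is now awaited through wait_for with self._request_handler_timeout, so exceeding it surfaces as the timeout path handled earlier (and, together with UserHandlerTimeoutError, gets the reduced traceback). A sketch of configuring the timeout, assuming it maps to a request_handler_timeout constructor option backing the attribute used above:

    from datetime import timedelta

    from crawlee.crawlers import BasicCrawler

    # Assumed constructor option backing `self._request_handler_timeout` above.
    crawler = BasicCrawler(request_handler_timeout=timedelta(minutes=2))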
@@ -1636,3 +1651,14 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
            )

        self._previous_crawler_state = current_state
+
+    async def _mark_request_as_handled(self, request: Request) -> None:
+        request_manager = await self.get_request_manager()
+        await wait_for(
+            lambda: request_manager.mark_request_as_handled(request),
+            timeout=self._internal_timeout,
+            timeout_message='Marking request as handled timed out after '
+            f'{self._internal_timeout.total_seconds()} seconds',
+            logger=self._logger,
+            max_retries=3,
+        )
crawlee/crawlers/_basic/_context_utils.py (new file)

@@ -0,0 +1,24 @@
+from __future__ import annotations
+
+from contextlib import contextmanager
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+
+    from crawlee._request import Request
+
+    from ._basic_crawling_context import BasicCrawlingContext
+
+
+@contextmanager
+def swapped_context(
+    context: BasicCrawlingContext,
+    request: Request,
+) -> Iterator[None]:
+    """Replace context's isolated copies with originals after handler execution."""
+    try:
+        yield
+    finally:
+        # Restore original context state to avoid side effects between different handlers.
+        object.__setattr__(context, 'request', request)