crawlee 0.6.13b15__py3-none-any.whl → 1.3.1b3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlee might be problematic.
- crawlee/__init__.py +2 -1
- crawlee/_autoscaling/snapshotter.py +1 -1
- crawlee/_browserforge_workaround.py +7 -3
- crawlee/_request.py +64 -43
- crawlee/_service_locator.py +44 -24
- crawlee/_types.py +128 -36
- crawlee/_utils/context.py +3 -3
- crawlee/_utils/file.py +8 -1
- crawlee/_utils/globs.py +4 -4
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +27 -3
- crawlee/_utils/requests.py +0 -26
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +16 -7
- crawlee/_utils/system.py +30 -14
- crawlee/_utils/time.py +120 -0
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +5 -2
- crawlee/browsers/_playwright_browser.py +2 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +2 -0
- crawlee/crawlers/__init__.py +5 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +57 -17
- crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
- crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
- crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +27 -32
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
- crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
- crawlee/crawlers/_basic/_basic_crawler.py +254 -148
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +27 -4
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +73 -23
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +12 -6
- crawlee/events/_types.py +6 -6
- crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
- crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/fingerprint_suite/_types.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +68 -14
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +32 -11
- crawlee/otel/crawler_instrumentor.py +4 -6
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_request_list.py +3 -3
- crawlee/request_loaders/_request_loader.py +5 -1
- crawlee/request_loaders/_sitemap_request_loader.py +248 -50
- crawlee/router.py +13 -3
- crawlee/sessions/_cookies.py +13 -8
- crawlee/sessions/_models.py +5 -5
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +62 -12
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_dataset_client.py +2 -2
- crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_base/_request_queue_client.py +2 -2
- crawlee/storage_clients/_base/_storage_client.py +13 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +33 -30
- crawlee/storage_clients/_file_system/_key_value_store_client.py +34 -27
- crawlee/storage_clients/_file_system/_request_queue_client.py +171 -154
- crawlee/storage_clients/_file_system/_storage_client.py +16 -3
- crawlee/storage_clients/_file_system/_utils.py +0 -0
- crawlee/storage_clients/_memory/_dataset_client.py +16 -4
- crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
- crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
- crawlee/storage_clients/_memory/_storage_client.py +6 -3
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +292 -0
- crawlee/storage_clients/_redis/_dataset_client.py +329 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
- crawlee/storage_clients/_redis/_storage_client.py +149 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +282 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +21 -14
- crawlee/storages/_base.py +5 -1
- crawlee/storages/_dataset.py +12 -2
- crawlee/storages/_key_value_store.py +17 -4
- crawlee/storages/_request_queue.py +13 -5
- crawlee/storages/_storage_instance_manager.py +196 -75
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/METADATA +24 -19
- crawlee-1.3.1b3.dist-info/RECORD +207 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/WHEEL +1 -1
- crawlee/_utils/measure_time.py +0 -31
- crawlee-0.6.13b15.dist-info/RECORD +0 -183
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/licenses/LICENSE +0 -0
--- crawlee/crawlers/_basic/_basic_crawler.py
+++ crawlee/crawlers/_basic/_basic_crawler.py
@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import asyncio
+import functools
 import logging
 import signal
 import sys
@@ -13,8 +14,9 @@ from collections.abc import AsyncGenerator, Awaitable, Callable, Iterable, Seque
 from contextlib import AsyncExitStack, suppress
 from datetime import timedelta
 from functools import partial
+from io import StringIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generic, Literal, cast
+from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast
 from urllib.parse import ParseResult, urlparse
 from weakref import WeakKeyDictionary
 
@@ -27,9 +29,12 @@ from crawlee import EnqueueStrategy, Glob, RequestTransformAction, service_locat
 from crawlee._autoscaling import AutoscaledPool, Snapshotter, SystemStatus
 from crawlee._log_config import configure_logger, get_configured_log_level, string_to_log_level
 from crawlee._request import Request, RequestOptions, RequestState
+from crawlee._service_locator import ServiceLocator
 from crawlee._types import (
     BasicCrawlingContext,
     EnqueueLinksKwargs,
+    ExportDataCsvKwargs,
+    ExportDataJsonKwargs,
     GetKeyValueStoreFromRequestHandlerFunction,
     HttpHeaders,
     HttpPayload,
@@ -39,7 +44,7 @@ from crawlee._types import (
     SkippedReason,
 )
 from crawlee._utils.docs import docs_group
-from crawlee._utils.file import export_csv_to_stream, export_json_to_stream
+from crawlee._utils.file import atomic_write, export_csv_to_stream, export_json_to_stream
 from crawlee._utils.recurring_task import RecurringTask
 from crawlee._utils.robots import RobotsTxtFile
 from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
@@ -54,6 +59,7 @@ from crawlee.errors import (
     RequestHandlerError,
     SessionError,
     UserDefinedErrorHandlerError,
+    UserHandlerTimeoutError,
 )
 from crawlee.events._types import Event, EventCrawlerStatusData
 from crawlee.http_clients import ImpitHttpClient
@@ -63,6 +69,7 @@ from crawlee.statistics import Statistics, StatisticsState
 from crawlee.storages import Dataset, KeyValueStore, RequestQueue
 
 from ._context_pipeline import ContextPipeline
+from ._context_utils import swapped_context
 from ._logging_utils import (
     get_one_line_error_summary_if_possible,
     reduce_asyncio_timeout_error_to_relevant_traceback_parts,
@@ -95,6 +102,9 @@ if TYPE_CHECKING:
     TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
     TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
     TRequestIterator = TypeVar('TRequestIterator', str, Request)
+    TParams = ParamSpec('TParams')
+    T = TypeVar('T')
+
     ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Request | None]]
     FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]]
     SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]
@@ -204,7 +214,7 @@ class _BasicCrawlerOptions(TypedDict):
     Returning `None` suppresses the status message."""
 
 
-class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], TypedDict):
+class _BasicCrawlerOptionsGeneric(TypedDict, Generic[TCrawlingContext, TStatisticsState]):
    """Generic options the `BasicCrawler` constructor."""
 
    request_handler: NotRequired[Callable[[TCrawlingContext], Awaitable[None]]]
@@ -219,9 +229,9 @@ class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], T
 
 
 class BasicCrawlerOptions(
-    Generic[TCrawlingContext, TStatisticsState],
     _BasicCrawlerOptions,
     _BasicCrawlerOptionsGeneric[TCrawlingContext, TStatisticsState],
+    Generic[TCrawlingContext, TStatisticsState],
 ):
     """Arguments for the `BasicCrawler` constructor.
 
@@ -346,14 +356,23 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         _logger: A logger instance, typically provided by a subclass, for consistent logging labels.
             Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
         """
-
-
-
-
-
-        service_locator.set_event_manager(event_manager)
+        implicit_event_manager_with_explicit_config = False
+        if not configuration:
+            configuration = service_locator.get_configuration()
+        elif not event_manager:
+            implicit_event_manager_with_explicit_config = True
 
-
+        if not storage_client:
+            storage_client = service_locator.get_storage_client()
+
+        if not event_manager:
+            event_manager = service_locator.get_event_manager()
+
+        self._service_locator = ServiceLocator(
+            configuration=configuration, storage_client=storage_client, event_manager=event_manager
+        )
+
+        config = self._service_locator.get_configuration()
 
         # Core components
         self._request_manager = request_manager
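The constructor hunk above replaces the old writes into the global `service_locator` with a per-crawler `ServiceLocator`, and a later hunk adds a warning when an explicit configuration arrives without an explicit event manager. A minimal, hedged sketch of how a caller might satisfy that expectation (import paths follow crawlee's usual public exports; nothing here is taken verbatim from the diff):

from crawlee.configuration import Configuration
from crawlee.crawlers import BasicCrawler
from crawlee.events import LocalEventManager

# Sketch only: when an explicit Configuration is passed, pass the event manager
# explicitly as well so the new "No event manager set" warning is not emitted.
crawler = BasicCrawler(
    configuration=Configuration(),
    event_manager=LocalEventManager(),
)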
@@ -391,7 +410,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._context_result_map = WeakKeyDictionary[BasicCrawlingContext, RequestHandlerRunResult]()
 
         # Context pipeline
-        self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)
+        self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)  # ty: ignore[invalid-argument-type]
 
         # Crawl settings
         self._max_request_retries = max_request_retries
@@ -419,17 +438,31 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         httpx_logger = logging.getLogger('httpx')  # Silence HTTPX logger
         httpx_logger.setLevel(logging.DEBUG if get_configured_log_level() <= logging.DEBUG else logging.WARNING)
         self._logger = _logger or logging.getLogger(__name__)
+        if implicit_event_manager_with_explicit_config:
+            self._logger.warning(
+                'No event manager set, implicitly using event manager from global service_locator.'
+                'It is advised to explicitly set the event manager if explicit configuration is used as well.'
+            )
         self._statistics_log_format = statistics_log_format
 
         # Statistics
-
-
-
-
-
-
-
-
+        if statistics:
+            self._statistics = statistics
+        else:
+
+            async def persist_state_factory() -> KeyValueStore:
+                return await self.get_key_value_store()
+
+            self._statistics = cast(
+                'Statistics[TStatisticsState]',
+                Statistics.with_default_state(
+                    persistence_enabled=True,
+                    periodic_message_logger=self._logger,
+                    statistics_log_format=self._statistics_log_format,
+                    log_message='Current request statistics:',
+                    persist_state_kvs_factory=persist_state_factory,
+                ),
+            )
 
         # Additional context managers to enter and exit
         self._additional_context_managers = _additional_context_managers or []
@@ -496,6 +529,24 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._logger.info(f'Crawler.stop() was called with following reason: {reason}.')
         self._unexpected_stop = True
 
+    def _wrap_handler_with_error_context(
+        self, handler: Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]
+    ) -> Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]:
+        """Decorate error handlers to make their context helpers usable."""
+
+        @functools.wraps(handler)
+        async def wrapped_handler(context: TCrawlingContext | BasicCrawlingContext, exception: Exception) -> T:
+            # Original context helpers that are from `RequestHandlerRunResult` will not be committed as the request
+            # failed. Modified context provides context helpers with direct access to the storages.
+            error_context = context.create_modified_copy(
+                push_data=self._push_data,
+                get_key_value_store=self.get_key_value_store,
+                add_requests=functools.partial(self._add_requests, context),
+            )
+            return await handler(error_context, exception)
+
+        return wrapped_handler
+
     def _stop_if_max_requests_count_exceeded(self) -> None:
         """Call `stop` when the maximum number of requests to crawl has been reached."""
         if self._max_requests_per_crawl is None:
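Because `error_handler` and `failed_request_handler` registrations are now wrapped (see the hunks further below), a handler receives a context copy whose `push_data`, `get_key_value_store` and `add_requests` helpers talk to the storages directly instead of the discarded `RequestHandlerRunResult`. A hedged usage sketch; the handler name and recorded fields are illustrative, and the import paths assume crawlee's usual re-exports:

from crawlee.crawlers import BasicCrawler, BasicCrawlingContext

crawler = BasicCrawler()

@crawler.failed_request_handler
async def record_failure(context: BasicCrawlingContext, error: Exception) -> None:
    # The modified error context commits this push immediately, even though the
    # failed request's own handler result is never committed.
    await context.push_data({'url': context.request.url, 'error': str(error)})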
@@ -548,7 +599,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
     async def get_request_manager(self) -> RequestManager:
         """Return the configured request manager. If none is configured, open and return the default request queue."""
         if not self._request_manager:
-            self._request_manager = await RequestQueue.open(
+            self._request_manager = await RequestQueue.open(
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )
 
         return self._request_manager
 
@@ -557,18 +611,32 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
     ) -> Dataset:
         """Return the `Dataset` with the given ID or name. If none is provided, return the default one."""
-        return await Dataset.open(
+        return await Dataset.open(
+            id=id,
+            name=name,
+            alias=alias,
+            storage_client=self._service_locator.get_storage_client(),
+            configuration=self._service_locator.get_configuration(),
+        )
 
     async def get_key_value_store(
         self,
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
     ) -> KeyValueStore:
         """Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS."""
-        return await KeyValueStore.open(
+        return await KeyValueStore.open(
+            id=id,
+            name=name,
+            alias=alias,
+            storage_client=self._service_locator.get_storage_client(),
+            configuration=self._service_locator.get_configuration(),
+        )
 
     def error_handler(
         self, handler: ErrorHandler[TCrawlingContext | BasicCrawlingContext]
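`get_dataset` and `get_key_value_store` (and, further below, `get_data`, `export_data` and `_push_data`) now accept an `alias` in addition to `name`: per the docstrings in these hunks, `name` addresses a global named storage while `alias` addresses a run-scoped unnamed one. A short sketch; the alias and name values are arbitrary examples:

from crawlee.crawlers import BasicCrawler

async def open_storages(crawler: BasicCrawler) -> None:
    # Run-scoped key-value store: unnamed, not shared with other runs.
    cache = await crawler.get_key_value_store(alias='run-cache')
    await cache.set_value('started', True)

    # Globally named dataset: shared by anything that opens the same name.
    products = await crawler.get_dataset(name='products')
    await products.push_data({'example': 'item'})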
@@ -577,7 +645,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         The error handler is invoked after a request handler error occurs and before a retry attempt.
         """
-        self._error_handler = handler
+        self._error_handler = self._wrap_handler_with_error_context(handler)
         return handler
 
     def failed_request_handler(
@@ -587,7 +655,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         The failed request handler is invoked when a request has failed all retry attempts.
         """
-        self._failed_request_handler = handler
+        self._failed_request_handler = self._wrap_handler_with_error_context(handler)
         return handler
 
     def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequestCallback:
@@ -627,7 +695,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         request_manager = await self.get_request_manager()
         if purge_request_queue and isinstance(request_manager, RequestQueue):
             await request_manager.drop()
-            self._request_manager = await RequestQueue.open(
+            self._request_manager = await RequestQueue.open(
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )
 
         if requests is not None:
             await self.add_requests(requests)
@@ -654,7 +725,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         except CancelledError:
             pass
         finally:
-            await self._crawler_state_rec_task.stop()
             if threading.current_thread() is threading.main_thread():
                 with suppress(NotImplementedError):
                     asyncio.get_running_loop().remove_signal_handler(signal.SIGINT)
@@ -684,9 +754,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         return final_statistics
 
     async def _run_crawler(self) -> None:
-        event_manager =
-
-        self._crawler_state_rec_task.start()
+        event_manager = self._service_locator.get_event_manager()
 
         # Collect the context managers to be entered. Context managers that are already active are excluded,
         # as they were likely entered by the caller, who will also be responsible for exiting them.
@@ -698,6 +766,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                 self._statistics,
                 self._session_pool if self._use_session_pool else None,
                 self._http_client,
+                self._crawler_state_rec_task,
                 *self._additional_context_managers,
             )
             if cm and getattr(cm, 'active', False) is False
@@ -705,7 +774,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         async with AsyncExitStack() as exit_stack:
             for context in contexts_to_enter:
-                await exit_stack.enter_async_context(context)  #
+                await exit_stack.enter_async_context(context)  # ty: ignore[invalid-argument-type]
 
             await self._autoscaled_pool.run()
 
@@ -772,6 +841,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self,
         dataset_id: str | None = None,
         dataset_name: str | None = None,
+        dataset_alias: str | None = None,
         **kwargs: Unpack[GetDataKwargs],
     ) -> DatasetItemsListPage:
         """Retrieve data from a `Dataset`.
@@ -781,13 +851,20 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         Args:
             dataset_id: The ID of the `Dataset`.
-            dataset_name: The name of the `Dataset
+            dataset_name: The name of the `Dataset` (global scope, named storage).
+            dataset_alias: The alias of the `Dataset` (run scope, unnamed storage).
             kwargs: Keyword arguments to be passed to the `Dataset.get_data()` method.
 
        Returns:
            The retrieved data.
        """
-        dataset = await Dataset.open(
+        dataset = await Dataset.open(
+            id=dataset_id,
+            name=dataset_name,
+            alias=dataset_alias,
+            storage_client=self._service_locator.get_storage_client(),
+            configuration=self._service_locator.get_configuration(),
+        )
         return await dataset.get_data(**kwargs)
 
     async def export_data(
@@ -795,6 +872,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         path: str | Path,
         dataset_id: str | None = None,
         dataset_name: str | None = None,
+        dataset_alias: str | None = None,
+        **additional_kwargs: Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs],
     ) -> None:
         """Export all items from a Dataset to a JSON or CSV file.
 
@@ -804,18 +883,31 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         Args:
             path: The destination file path. Must end with '.json' or '.csv'.
-            dataset_id: The ID of the Dataset to export from.
-            dataset_name: The name of the Dataset to export from
+            dataset_id: The ID of the Dataset to export from.
+            dataset_name: The name of the Dataset to export from (global scope, named storage).
+            dataset_alias: The alias of the Dataset to export from (run scope, unnamed storage).
+            additional_kwargs: Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format.
         """
-        dataset = await
+        dataset = await Dataset.open(
+            id=dataset_id,
+            name=dataset_name,
+            alias=dataset_alias,
+            storage_client=self._service_locator.get_storage_client(),
+            configuration=self._service_locator.get_configuration(),
+        )
 
-        path =
-        dst = path.open('w', newline='')
+        path = Path(path)
 
         if path.suffix == '.csv':
-
+            dst = StringIO()
+            csv_kwargs = cast('ExportDataCsvKwargs', additional_kwargs)
+            await export_csv_to_stream(dataset.iterate_items(), dst, **csv_kwargs)
+            await atomic_write(path, dst.getvalue())
         elif path.suffix == '.json':
-
+            dst = StringIO()
+            json_kwargs = cast('ExportDataJsonKwargs', additional_kwargs)
+            await export_json_to_stream(dataset.iterate_items(), dst, **json_kwargs)
+            await atomic_write(path, dst.getvalue())
         else:
             raise ValueError(f'Unsupported file extension: {path.suffix}')
 
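`export_data` now renders the dataset into an in-memory buffer, picks the exporter from the file extension, forwards any extra keyword arguments to it, and writes the result atomically. A hedged sketch of the call site (file names and the dataset alias are illustrative):

from crawlee.crawlers import BasicCrawler

async def export_results(crawler: BasicCrawler) -> None:
    # JSON export of the default dataset.
    await crawler.export_data('results.json')

    # CSV export of a run-scoped dataset opened under an alias.
    await crawler.export_data('items.csv', dataset_alias='run-items')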
@@ -824,6 +916,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         data: list[dict[str, Any]] | dict[str, Any],
         dataset_id: str | None = None,
         dataset_name: str | None = None,
+        dataset_alias: str | None = None,
         **kwargs: Unpack[PushDataKwargs],
     ) -> None:
         """Push data to a `Dataset`.
@@ -834,10 +927,11 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         Args:
             data: The data to push to the `Dataset`.
             dataset_id: The ID of the `Dataset`.
-            dataset_name: The name of the `Dataset
+            dataset_name: The name of the `Dataset` (global scope, named storage).
+            dataset_alias: The alias of the `Dataset` (run scope, unnamed storage).
             kwargs: Keyword arguments to be passed to the `Dataset.push_data()` method.
         """
-        dataset = await self.get_dataset(id=dataset_id, name=dataset_name)
+        dataset = await self.get_dataset(id=dataset_id, name=dataset_name, alias=dataset_alias)
         await dataset.push_data(data, **kwargs)
 
     def _should_retry_request(self, context: BasicCrawlingContext, error: Exception) -> bool:
@@ -894,6 +988,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
             | None = None,
             requests: Sequence[str | Request] | None = None,
+            rq_id: str | None = None,
+            rq_name: str | None = None,
+            rq_alias: str | None = None,
             **kwargs: Unpack[EnqueueLinksKwargs],
         ) -> None:
             kwargs.setdefault('strategy', 'same-hostname')
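The `enqueue_links` signature gains `rq_id`, `rq_name` and `rq_alias`, so extracted links can be routed into a request queue other than the crawler's default one. A sketch under the assumption that these parameters are forwarded through the public context helper as the hunk suggests (crawler choice, label and queue alias are illustrative):

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext

crawler = ParselCrawler()

@crawler.router.default_handler
async def handle(context: ParselCrawlingContext) -> None:
    # Route detail-page links into a separate, run-scoped request queue.
    await context.enqueue_links(label='DETAIL', rq_alias='detail-queue')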
@@ -905,7 +1002,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                         '`transform_request_function` arguments when `requests` is provided.'
                     )
                 # Add directly passed requests.
-                await context.add_requests(
+                await context.add_requests(
+                    requests or list[str | Request](), rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, **kwargs
+                )
             else:
                 # Add requests from extracted links.
                 await context.add_requests(
@@ -914,7 +1013,11 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                         label=label,
                         user_data=user_data,
                         transform_request_function=transform_request_function,
+                        **kwargs,
                     ),
+                    rq_id=rq_id,
+                    rq_name=rq_name,
+                    rq_alias=rq_alias,
                     **kwargs,
                 )
 
@@ -936,7 +1039,12 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         warning_flag = True
 
         for request in request_iterator:
-
+            if isinstance(request, Request):
+                if request.enqueue_strategy != strategy:
+                    request.enqueue_strategy = strategy
+                target_url = request.url
+            else:
+                target_url = request
             parsed_target_url = urlparse(target_url)
 
             if warning_flag and strategy != 'all' and not parsed_target_url.hostname:
@@ -948,9 +1056,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             ) and self._check_url_patterns(target_url, kwargs.get('include'), kwargs.get('exclude')):
                 yield request
 
-
-
-
+                if limit is not None:
+                    limit -= 1
+                    if limit <= 0:
+                        break
 
     def _check_enqueue_strategy(
         self,
@@ -974,8 +1083,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             return target_url.hostname == origin_url.hostname
 
         if strategy == 'same-domain':
-            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).
-            target_domain = self._tld_extractor.extract_str(target_url.hostname).
+            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).top_domain_under_public_suffix
+            target_domain = self._tld_extractor.extract_str(target_url.hostname).top_domain_under_public_suffix
             return origin_domain == target_domain
 
         if strategy == 'same-origin':
@@ -1031,8 +1140,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         if self._should_retry_request(context, error):
            request.retry_count += 1
+            reduced_error = str(error).split('\n')[0]
            self.log.warning(
-                f'Retrying request to {context.request.url} due to: {
+                f'Retrying request to {context.request.url} due to: {reduced_error}. '
                f'{get_one_line_error_summary_if_possible(error)}'
            )
            await self._statistics.error_tracker.add(error=error, context=context)
@@ -1043,21 +1153,17 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                 except Exception as e:
                     raise UserDefinedErrorHandlerError('Exception thrown in user-defined request error handler') from e
                 else:
-                    if new_request is not None:
-
+                    if new_request is not None and new_request != request:
+                        await request_manager.add_request(new_request)
+                        await self._mark_request_as_handled(request)
+                        return
 
             await request_manager.reclaim_request(request)
         else:
-
-
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            request.state = RequestState.ERROR
+            await self._mark_request_as_handled(request)
             await self._handle_failed_request(context, error)
-            self._statistics.record_request_processing_failure(request.
+            self._statistics.record_request_processing_failure(request.unique_key)
 
     async def _handle_request_error(self, context: TCrawlingContext | BasicCrawlingContext, error: Exception) -> None:
         try:
@@ -1070,8 +1176,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                 f'{self._internal_timeout.total_seconds()} seconds',
                 logger=self._logger,
             )
-
-            context.request.state = RequestState.DONE
         except UserDefinedErrorHandlerError:
             context.request.state = RequestState.ERROR
             raise
@@ -1104,17 +1208,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False
     ) -> None:
         if need_mark and isinstance(request, Request):
-            request_manager = await self.get_request_manager()
-
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
             request.state = RequestState.SKIPPED
+            await self._mark_request_as_handled(request)
 
         url = request.url if isinstance(request, Request) else request
 
@@ -1134,10 +1229,11 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         if (
             isinstance(error, asyncio.exceptions.TimeoutError)
+            and traceback_parts
             and self._request_handler_timeout_text in traceback_parts[-1]
-        ):
+        ) or isinstance(error, UserHandlerTimeoutError):
             used_traceback_parts = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
-            used_traceback_parts.
+            used_traceback_parts.extend(traceback_parts[-1:])
 
         return ''.join(used_traceback_parts).strip('\n')
 
@@ -1186,47 +1282,61 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             else:
                 yield Request.from_url(url)
 
-    async def
-
-
-
-
-
-
-
-
-
-
-
-
+    async def _add_requests(
+        self,
+        context: BasicCrawlingContext,
+        requests: Sequence[str | Request],
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
+        **kwargs: Unpack[EnqueueLinksKwargs],
+    ) -> None:
+        """Add requests method aware of the crawling context."""
+        if rq_id or rq_name or rq_alias:
+            request_manager: RequestManager = await RequestQueue.open(
+                id=rq_id,
+                name=rq_name,
+                alias=rq_alias,
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )
+        else:
+            request_manager = await self.get_request_manager()
 
-
+        context_aware_requests = list[Request]()
+        base_url = kwargs.get('base_url') or context.request.loaded_url or context.request.url
+        requests_iterator = self._convert_url_to_request_iterator(requests, base_url)
+        filter_requests_iterator = self._enqueue_links_filter_iterator(requests_iterator, context.request.url, **kwargs)
+        for dst_request in filter_requests_iterator:
+            # Update the crawl depth of the request.
+            dst_request.crawl_depth = context.request.crawl_depth + 1
 
-
-
-            )
+            if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:
+                context_aware_requests.append(dst_request)
 
-
-            # Update the crawl depth of the request.
-            dst_request.crawl_depth = context.request.crawl_depth + 1
+        return await request_manager.add_requests(context_aware_requests)
 
-
-
+    async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
+        """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
+        result = self._context_result_map[context]
 
-
+        for add_requests_call in result.add_requests_calls:
+            await self._add_requests(context, **add_requests_call)
 
         for push_data_call in result.push_data_calls:
             await self._push_data(**push_data_call)
 
         await self._commit_key_value_store_changes(result, get_kvs=self.get_key_value_store)
 
+        result.apply_request_changes(target=context.request)
+
     @staticmethod
     async def _commit_key_value_store_changes(
         result: RequestHandlerRunResult, get_kvs: GetKeyValueStoreFromRequestHandlerFunction
     ) -> None:
         """Store key value store changes recorded in result."""
-        for (id, name), changes in result.key_value_store_changes.items():
-            store = await get_kvs(id=id, name=name)
+        for (id, name, alias), changes in result.key_value_store_changes.items():
+            store = await get_kvs(id=id, name=name, alias=alias)
             for key, value in changes.updates.items():
                 await store.set_value(key, value.content, value.content_type)
 
@@ -1274,7 +1384,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         if not (await self._is_allowed_based_on_robots_txt_file(request.url)):
             self._logger.warning(
-                f'Skipping request {request.url} ({request.
+                f'Skipping request {request.url} ({request.unique_key}) because it is disallowed based on robots.txt'
             )
 
             await self._handle_skipped_request(request, 'robots_txt', need_mark=True)
@@ -1285,10 +1395,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         else:
             session = await self._get_session()
             proxy_info = await self._get_proxy_info(request, session)
-        result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store)
+        result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store, request=request)
 
         context = BasicCrawlingContext(
-            request=request,
+            request=result.request,
             session=session,
             proxy_info=proxy_info,
             send_request=self._prepare_send_request_function(session, proxy_info),
@@ -1300,38 +1410,31 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         )
         self._context_result_map[context] = result
 
-
-        self._statistics.record_request_processing_start(statistics_id)
+        self._statistics.record_request_processing_start(request.unique_key)
 
         try:
             request.state = RequestState.REQUEST_HANDLER
 
-            self._check_request_collision(context.request, context.session)
-
             try:
-
+                with swapped_context(context, request):
+                    self._check_request_collision(request, session)
+                    await self._run_request_handler(context=context)
             except asyncio.TimeoutError as e:
                 raise RequestHandlerError(e, context) from e
 
             await self._commit_request_handler_result(context)
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
 
             request.state = RequestState.DONE
 
-
-                context.session.mark_good()
+            await self._mark_request_as_handled(request)
 
-
+            if session and session.is_usable:
+                session.mark_good()
+
+            self._statistics.record_request_processing_finish(request.unique_key)
 
         except RequestCollisionError as request_error:
-
+            request.no_retry = True
             await self._handle_request_error(context, request_error)
 
         except RequestHandlerError as primary_error:
@@ -1346,46 +1449,34 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             await self._handle_request_error(primary_error.crawling_context, primary_error.wrapped_exception)
 
         except SessionError as session_error:
-            if not
+            if not session:
                 raise RuntimeError('SessionError raised in a crawling context without a session') from session_error
 
             if self._error_handler:
                 await self._error_handler(context, session_error)
 
             if self._should_retry_request(context, session_error):
-
+                exc_only = ''.join(traceback.format_exception_only(session_error)).strip()
+                self._logger.warning('Encountered "%s", rotating session and retrying...', exc_only)
 
-
+                if session:
+                    session.retire()
 
                 # Increment session rotation count.
-
+                request.session_rotation_count = (request.session_rotation_count or 0) + 1
 
                 await request_manager.reclaim_request(request)
                 await self._statistics.error_tracker_retry.add(error=session_error, context=context)
             else:
-                await
-                    lambda: request_manager.mark_request_as_handled(context.request),
-                    timeout=self._internal_timeout,
-                    timeout_message='Marking request as handled timed out after '
-                    f'{self._internal_timeout.total_seconds()} seconds',
-                    logger=self._logger,
-                    max_retries=3,
-                )
+                await self._mark_request_as_handled(request)
 
                 await self._handle_failed_request(context, session_error)
-                self._statistics.record_request_processing_failure(
+                self._statistics.record_request_processing_failure(request.unique_key)
 
         except ContextPipelineInterruptedError as interrupted_error:
             self._logger.debug('The context pipeline was interrupted', exc_info=interrupted_error)
 
-            await
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            await self._mark_request_as_handled(request)
 
         except ContextPipelineInitializationError as initialization_error:
             self._logger.debug(
@@ -1403,12 +1494,16 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             raise
 
     async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
-
-
-
-
-
-
+        context.request.state = RequestState.BEFORE_NAV
+        await self._context_pipeline(
+            context,
+            lambda final_context: wait_for(
+                lambda: self.router(final_context),
+                timeout=self._request_handler_timeout,
+                timeout_message=f'{self._request_handler_timeout_text}'
+                f' {self._request_handler_timeout.total_seconds()} seconds',
+                logger=self._logger,
+            ),
         )
 
     def _raise_for_error_status_code(self, status_code: int) -> None:
@@ -1519,7 +1614,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
     async def _crawler_state_task(self) -> None:
         """Emit a persist state event with the given migration status."""
-        event_manager =
+        event_manager = self._service_locator.get_event_manager()
 
         current_state = self.statistics.state
 
@@ -1556,3 +1651,14 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             )
 
         self._previous_crawler_state = current_state
+
+    async def _mark_request_as_handled(self, request: Request) -> None:
+        request_manager = await self.get_request_manager()
+        await wait_for(
+            lambda: request_manager.mark_request_as_handled(request),
+            timeout=self._internal_timeout,
+            timeout_message='Marking request as handled timed out after '
+            f'{self._internal_timeout.total_seconds()} seconds',
+            logger=self._logger,
+            max_retries=3,
+        )