crawlee 0.6.13b17__py3-none-any.whl → 1.1.2b7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This release of crawlee has been flagged as potentially problematic.
- crawlee/_autoscaling/snapshotter.py +1 -1
- crawlee/_request.py +35 -33
- crawlee/_service_locator.py +44 -24
- crawlee/_types.py +106 -34
- crawlee/_utils/context.py +2 -2
- crawlee/_utils/file.py +7 -0
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +17 -1
- crawlee/_utils/requests.py +0 -26
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +4 -2
- crawlee/_utils/system.py +3 -3
- crawlee/_utils/time.py +120 -0
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +4 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +2 -0
- crawlee/crawlers/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +50 -12
- crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
- crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +39 -15
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
- crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
- crawlee/crawlers/_basic/_basic_crawler.py +219 -126
- crawlee/crawlers/_basic/_logging_utils.py +5 -1
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_playwright/_playwright_crawler.py +60 -11
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/events/_event_manager.py +4 -4
- crawlee/events/_types.py +6 -6
- crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/fingerprint_suite/_types.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +12 -0
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/otel/crawler_instrumentor.py +3 -3
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_request_list.py +3 -3
- crawlee/request_loaders/_request_loader.py +5 -1
- crawlee/request_loaders/_sitemap_request_loader.py +248 -50
- crawlee/sessions/_models.py +2 -2
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +43 -4
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_request_queue_client.py +2 -2
- crawlee/storage_clients/_base/_storage_client.py +13 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +29 -27
- crawlee/storage_clients/_file_system/_key_value_store_client.py +30 -26
- crawlee/storage_clients/_file_system/_request_queue_client.py +169 -153
- crawlee/storage_clients/_file_system/_storage_client.py +16 -3
- crawlee/storage_clients/_file_system/_utils.py +0 -0
- crawlee/storage_clients/_memory/_dataset_client.py +16 -4
- crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
- crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
- crawlee/storage_clients/_memory/_storage_client.py +6 -3
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +295 -0
- crawlee/storage_clients/_redis/_dataset_client.py +325 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
- crawlee/storage_clients/_redis/_storage_client.py +146 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +282 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +13 -11
- crawlee/storages/_base.py +5 -1
- crawlee/storages/_dataset.py +12 -2
- crawlee/storages/_key_value_store.py +17 -4
- crawlee/storages/_request_queue.py +13 -5
- crawlee/storages/_storage_instance_manager.py +133 -71
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/METADATA +18 -6
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/RECORD +101 -78
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/WHEEL +1 -1
- crawlee/_utils/measure_time.py +0 -31
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/licenses/LICENSE +0 -0
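Beyond the per-file changes, the list above shows two entirely new storage backends, crawlee/storage_clients/_redis/ and crawlee/storage_clients/_sql/, together with new exports in crawlee/storage_clients/__init__.py. A hedged sketch of what wiring one in presumably looks like; the class name SqlStorageClient (and an analogous RedisStorageClient) and its constructor defaults are assumptions based on the package layout, not something this file list confirms:

from crawlee.crawlers import BasicCrawler
from crawlee.storage_clients import SqlStorageClient  # export name assumed; constructor options omitted here

# The crawler accepts any storage client through the same `storage_client`
# parameter used for the in-memory and file-system backends.
crawler = BasicCrawler(storage_client=SqlStorageClient())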
crawlee/crawlers/_basic/_basic_crawler.py

@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import asyncio
+import functools
 import logging
 import signal
 import sys
@@ -13,8 +14,9 @@ from collections.abc import AsyncGenerator, Awaitable, Callable, Iterable, Seque
 from contextlib import AsyncExitStack, suppress
 from datetime import timedelta
 from functools import partial
+from io import StringIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generic, Literal, cast
+from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast
 from urllib.parse import ParseResult, urlparse
 from weakref import WeakKeyDictionary
 
@@ -27,9 +29,12 @@ from crawlee import EnqueueStrategy, Glob, RequestTransformAction, service_locat
 from crawlee._autoscaling import AutoscaledPool, Snapshotter, SystemStatus
 from crawlee._log_config import configure_logger, get_configured_log_level, string_to_log_level
 from crawlee._request import Request, RequestOptions, RequestState
+from crawlee._service_locator import ServiceLocator
 from crawlee._types import (
     BasicCrawlingContext,
     EnqueueLinksKwargs,
+    ExportDataCsvKwargs,
+    ExportDataJsonKwargs,
     GetKeyValueStoreFromRequestHandlerFunction,
     HttpHeaders,
     HttpPayload,
@@ -39,7 +44,7 @@ from crawlee._types import (
     SkippedReason,
 )
 from crawlee._utils.docs import docs_group
-from crawlee._utils.file import export_csv_to_stream, export_json_to_stream
+from crawlee._utils.file import atomic_write, export_csv_to_stream, export_json_to_stream
 from crawlee._utils.recurring_task import RecurringTask
 from crawlee._utils.robots import RobotsTxtFile
 from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
@@ -95,6 +100,9 @@ if TYPE_CHECKING:
 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 TRequestIterator = TypeVar('TRequestIterator', str, Request)
+TParams = ParamSpec('TParams')
+T = TypeVar('T')
+
 ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Request | None]]
 FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]]
 SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]
@@ -204,7 +212,7 @@ class _BasicCrawlerOptions(TypedDict):
     Returning `None` suppresses the status message."""
 
 
-class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState]
+class _BasicCrawlerOptionsGeneric(TypedDict, Generic[TCrawlingContext, TStatisticsState]):
     """Generic options the `BasicCrawler` constructor."""
 
     request_handler: NotRequired[Callable[[TCrawlingContext], Awaitable[None]]]
@@ -219,9 +227,9 @@ class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], T
 
 
 class BasicCrawlerOptions(
-    Generic[TCrawlingContext, TStatisticsState],
     _BasicCrawlerOptions,
     _BasicCrawlerOptionsGeneric[TCrawlingContext, TStatisticsState],
+    Generic[TCrawlingContext, TStatisticsState],
 ):
     """Arguments for the `BasicCrawler` constructor.
 
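The two hunks above only reorder the base classes of the options TypedDicts; their purpose is unchanged. A minimal sketch (import paths assumed) of how these options are typically consumed, with a subclass forwarding keyword arguments typed via Unpack:

from typing import Unpack  # Python 3.11+; use typing_extensions.Unpack on older versions

from crawlee.crawlers import BasicCrawler, BasicCrawlerOptions, BasicCrawlingContext
from crawlee.statistics import StatisticsState


class MyCrawler(BasicCrawler[BasicCrawlingContext, StatisticsState]):
    """A crawler subclass that accepts the same constructor options as BasicCrawler."""

    def __init__(self, **kwargs: Unpack[BasicCrawlerOptions[BasicCrawlingContext, StatisticsState]]) -> None:
        super().__init__(**kwargs)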
@@ -346,14 +354,23 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             _logger: A logger instance, typically provided by a subclass, for consistent logging labels.
                 Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
         """
-
-
-
-
-
-            service_locator.set_event_manager(event_manager)
+        implicit_event_manager_with_explicit_config = False
+        if not configuration:
+            configuration = service_locator.get_configuration()
+        elif not event_manager:
+            implicit_event_manager_with_explicit_config = True
 
-
+        if not storage_client:
+            storage_client = service_locator.get_storage_client()
+
+        if not event_manager:
+            event_manager = service_locator.get_event_manager()
+
+        self._service_locator = ServiceLocator(
+            configuration=configuration, storage_client=storage_client, event_manager=event_manager
+        )
+
+        config = self._service_locator.get_configuration()
 
         # Core components
         self._request_manager = request_manager
@@ -419,17 +436,31 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         httpx_logger = logging.getLogger('httpx')  # Silence HTTPX logger
         httpx_logger.setLevel(logging.DEBUG if get_configured_log_level() <= logging.DEBUG else logging.WARNING)
         self._logger = _logger or logging.getLogger(__name__)
+        if implicit_event_manager_with_explicit_config:
+            self._logger.warning(
+                'No event manager set, implicitly using event manager from global service_locator.'
+                'It is advised to explicitly set the event manager if explicit configuration is used as well.'
+            )
         self._statistics_log_format = statistics_log_format
 
         # Statistics
-
-
-
-
-
-
-
-
+        if statistics:
+            self._statistics = statistics
+        else:
+
+            async def persist_state_factory() -> KeyValueStore:
+                return await self.get_key_value_store()
+
+            self._statistics = cast(
+                'Statistics[TStatisticsState]',
+                Statistics.with_default_state(
+                    persistence_enabled=True,
+                    periodic_message_logger=self._logger,
+                    statistics_log_format=self._statistics_log_format,
+                    log_message='Current request statistics:',
+                    persist_state_kvs_factory=persist_state_factory,
+                ),
+            )
 
         # Additional context managers to enter and exit
         self._additional_context_managers = _additional_context_managers or []
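The constructor now captures its services in a per-crawler ServiceLocator, falls back to the global service_locator only for services that were not passed in, and warns when an explicit configuration is combined with an implicitly resolved event manager. A minimal sketch, assuming the public class names below, of passing all three services explicitly so no fallback (and no warning) is involved:

import asyncio

from crawlee.configuration import Configuration
from crawlee.crawlers import BasicCrawler
from crawlee.events import LocalEventManager
from crawlee.storage_clients import MemoryStorageClient


async def main() -> None:
    # All three services are explicit, so the crawler's own ServiceLocator is
    # fully populated and nothing is read from the global service_locator.
    crawler = BasicCrawler(
        configuration=Configuration(),
        event_manager=LocalEventManager(),
        storage_client=MemoryStorageClient(),
    )


asyncio.run(main())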
@@ -496,6 +527,24 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._logger.info(f'Crawler.stop() was called with following reason: {reason}.')
         self._unexpected_stop = True
 
+    def _wrap_handler_with_error_context(
+        self, handler: Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]
+    ) -> Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]:
+        """Decorate error handlers to make their context helpers usable."""
+
+        @functools.wraps(handler)
+        async def wrapped_handler(context: TCrawlingContext | BasicCrawlingContext, exception: Exception) -> T:
+            # Original context helpers that are from `RequestHandlerRunResult` will not be commited as the request
+            # failed. Modified context provides context helpers with direct access to the storages.
+            error_context = context.create_modified_copy(
+                push_data=self._push_data,
+                get_key_value_store=self.get_key_value_store,
+                add_requests=functools.partial(self._add_requests, context),
+            )
+            return await handler(error_context, exception)
+
+        return wrapped_handler
+
     def _stop_if_max_requests_count_exceeded(self) -> None:
         """Call `stop` when the maximum number of requests to crawl has been reached."""
         if self._max_requests_per_crawl is None:
@@ -548,7 +597,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
     async def get_request_manager(self) -> RequestManager:
         """Return the configured request manager. If none is configured, open and return the default request queue."""
         if not self._request_manager:
-            self._request_manager = await RequestQueue.open(
+            self._request_manager = await RequestQueue.open(
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )
 
         return self._request_manager
 
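get_request_manager() only opens the default request queue when no request manager was configured, and now does so with the crawler-scoped storage client and configuration. A short sketch (assumed usage, inside an async context) of supplying a pre-opened queue instead:

from crawlee.crawlers import BasicCrawler
from crawlee.storages import RequestQueue

# A named queue is opened up front and handed to the crawler, so the default
# RequestQueue.open(...) call shown above is never reached.
queue = await RequestQueue.open(name='my-named-queue')
crawler_with_queue = BasicCrawler(request_manager=queue)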
@@ -557,18 +609,32 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
     ) -> Dataset:
         """Return the `Dataset` with the given ID or name. If none is provided, return the default one."""
-        return await Dataset.open(
+        return await Dataset.open(
+            id=id,
+            name=name,
+            alias=alias,
+            storage_client=self._service_locator.get_storage_client(),
+            configuration=self._service_locator.get_configuration(),
+        )
 
     async def get_key_value_store(
         self,
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
     ) -> KeyValueStore:
         """Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS."""
-        return await KeyValueStore.open(
+        return await KeyValueStore.open(
+            id=id,
+            name=name,
+            alias=alias,
+            storage_client=self._service_locator.get_storage_client(),
+            configuration=self._service_locator.get_configuration(),
+        )
 
     def error_handler(
         self, handler: ErrorHandler[TCrawlingContext | BasicCrawlingContext]
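Both storage accessors gain an alias parameter alongside id and name; judging by the docstrings later in this diff, name refers to a global named storage while alias refers to an unnamed storage scoped to the current run. A usage sketch, given a BasicCrawler instance `crawler` as in the earlier example:

# Run-scoped, unnamed key-value store and dataset opened by alias.
kvs = await crawler.get_key_value_store(alias='session-cache')
await kvs.set_value('token', 'secret-value')

dataset = await crawler.get_dataset(alias='intermediate-results')
await dataset.push_data({'url': 'https://example.com'})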
@@ -577,7 +643,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         The error handler is invoked after a request handler error occurs and before a retry attempt.
         """
-        self._error_handler = handler
+        self._error_handler = self._wrap_handler_with_error_context(handler)
         return handler
 
     def failed_request_handler(
@@ -587,7 +653,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         The failed request handler is invoked when a request has failed all retry attempts.
         """
-        self._failed_request_handler = handler
+        self._failed_request_handler = self._wrap_handler_with_error_context(handler)
         return handler
 
     def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequestCallback:
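Because a failed request's RequestHandlerRunResult is never committed, the wrapper registered above hands error and failed-request handlers a modified context whose push_data, get_key_value_store and add_requests helpers write directly to the storages. A sketch of a handler relying on that, reusing the `crawler` instance from earlier (import path assumed):

from crawlee.crawlers import BasicCrawlingContext


@crawler.failed_request_handler
async def record_failure(context: BasicCrawlingContext, error: Exception) -> None:
    # push_data here goes straight to the dataset; for a failed request there is
    # no handler run result left to commit it from.
    await context.push_data({'url': context.request.url, 'error': str(error)})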
@@ -627,7 +693,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         request_manager = await self.get_request_manager()
         if purge_request_queue and isinstance(request_manager, RequestQueue):
             await request_manager.drop()
-            self._request_manager = await RequestQueue.open(
+            self._request_manager = await RequestQueue.open(
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )
 
         if requests is not None:
             await self.add_requests(requests)
@@ -654,7 +723,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         except CancelledError:
             pass
         finally:
-            await self._crawler_state_rec_task.stop()
             if threading.current_thread() is threading.main_thread():
                 with suppress(NotImplementedError):
                     asyncio.get_running_loop().remove_signal_handler(signal.SIGINT)
@@ -684,9 +752,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         return final_statistics
 
     async def _run_crawler(self) -> None:
-        event_manager =
-
-        self._crawler_state_rec_task.start()
+        event_manager = self._service_locator.get_event_manager()
 
         # Collect the context managers to be entered. Context managers that are already active are excluded,
         # as they were likely entered by the caller, who will also be responsible for exiting them.
@@ -698,6 +764,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                 self._statistics,
                 self._session_pool if self._use_session_pool else None,
                 self._http_client,
+                self._crawler_state_rec_task,
                 *self._additional_context_managers,
             )
             if cm and getattr(cm, 'active', False) is False
@@ -772,6 +839,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self,
         dataset_id: str | None = None,
         dataset_name: str | None = None,
+        dataset_alias: str | None = None,
         **kwargs: Unpack[GetDataKwargs],
     ) -> DatasetItemsListPage:
         """Retrieve data from a `Dataset`.
@@ -781,13 +849,20 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         Args:
             dataset_id: The ID of the `Dataset`.
-            dataset_name: The name of the `Dataset
+            dataset_name: The name of the `Dataset` (global scope, named storage).
+            dataset_alias: The alias of the `Dataset` (run scope, unnamed storage).
             kwargs: Keyword arguments to be passed to the `Dataset.get_data()` method.
 
         Returns:
             The retrieved data.
         """
-        dataset = await Dataset.open(
+        dataset = await Dataset.open(
+            id=dataset_id,
+            name=dataset_name,
+            alias=dataset_alias,
+            storage_client=self._service_locator.get_storage_client(),
+            configuration=self._service_locator.get_configuration(),
+        )
         return await dataset.get_data(**kwargs)
 
     async def export_data(
@@ -795,6 +870,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         path: str | Path,
         dataset_id: str | None = None,
         dataset_name: str | None = None,
+        dataset_alias: str | None = None,
+        **additional_kwargs: Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs],  # type: ignore[misc]
     ) -> None:
         """Export all items from a Dataset to a JSON or CSV file.
 
@@ -804,18 +881,31 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         Args:
             path: The destination file path. Must end with '.json' or '.csv'.
-            dataset_id: The ID of the Dataset to export from.
-            dataset_name: The name of the Dataset to export from
+            dataset_id: The ID of the Dataset to export from.
+            dataset_name: The name of the Dataset to export from (global scope, named storage).
+            dataset_alias: The alias of the Dataset to export from (run scope, unnamed storage).
+            additional_kwargs: Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format.
         """
-        dataset = await
+        dataset = await Dataset.open(
+            id=dataset_id,
+            name=dataset_name,
+            alias=dataset_alias,
+            storage_client=self._service_locator.get_storage_client(),
+            configuration=self._service_locator.get_configuration(),
+        )
 
-        path =
-        dst = path.open('w', newline='')
+        path = Path(path)
 
         if path.suffix == '.csv':
-
+            dst = StringIO()
+            csv_kwargs = cast('ExportDataCsvKwargs', additional_kwargs)
+            await export_csv_to_stream(dataset.iterate_items(), dst, **csv_kwargs)
+            await atomic_write(path, dst.getvalue())
         elif path.suffix == '.json':
-
+            dst = StringIO()
+            json_kwargs = cast('ExportDataJsonKwargs', additional_kwargs)
+            await export_json_to_stream(dataset.iterate_items(), dst, **json_kwargs)
+            await atomic_write(path, dst.getvalue())
         else:
             raise ValueError(f'Unsupported file extension: {path.suffix}')
 
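export_data now renders the dataset into an in-memory StringIO buffer and writes the file atomically, choosing the exporter from the file suffix and forwarding the extra keyword arguments to it. Usage sketch, again with the `crawler` from earlier (the delimiter option is an assumed CSV-writer argument):

await crawler.export_data('results.json', dataset_alias='intermediate-results')
await crawler.export_data('results.csv', delimiter=';')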
@@ -824,6 +914,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         data: list[dict[str, Any]] | dict[str, Any],
         dataset_id: str | None = None,
         dataset_name: str | None = None,
+        dataset_alias: str | None = None,
         **kwargs: Unpack[PushDataKwargs],
     ) -> None:
         """Push data to a `Dataset`.
@@ -834,10 +925,11 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         Args:
             data: The data to push to the `Dataset`.
             dataset_id: The ID of the `Dataset`.
-            dataset_name: The name of the `Dataset
+            dataset_name: The name of the `Dataset` (global scope, named storage).
+            dataset_alias: The alias of the `Dataset` (run scope, unnamed storage).
             kwargs: Keyword arguments to be passed to the `Dataset.push_data()` method.
         """
-        dataset = await self.get_dataset(id=dataset_id, name=dataset_name)
+        dataset = await self.get_dataset(id=dataset_id, name=dataset_name, alias=dataset_alias)
         await dataset.push_data(data, **kwargs)
 
     def _should_retry_request(self, context: BasicCrawlingContext, error: Exception) -> bool:
@@ -894,6 +986,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
             | None = None,
             requests: Sequence[str | Request] | None = None,
+            rq_id: str | None = None,
+            rq_name: str | None = None,
+            rq_alias: str | None = None,
             **kwargs: Unpack[EnqueueLinksKwargs],
         ) -> None:
             kwargs.setdefault('strategy', 'same-hostname')
@@ -905,7 +1000,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                         '`transform_request_function` arguments when `requests` is provided.'
                     )
                 # Add directly passed requests.
-                await context.add_requests(
+                await context.add_requests(
+                    requests or list[str | Request](), rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, **kwargs
+                )
             else:
                 # Add requests from extracted links.
                 await context.add_requests(
@@ -914,7 +1011,11 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                         label=label,
                         user_data=user_data,
                         transform_request_function=transform_request_function,
+                        **kwargs,
                     ),
+                    rq_id=rq_id,
+                    rq_name=rq_name,
+                    rq_alias=rq_alias,
                     **kwargs,
                 )
 
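enqueue_links (and the direct add_requests path) can now target a different request queue through rq_id, rq_name or rq_alias instead of the crawler's default queue. A sketch on a crawler whose context supports link extraction (class names from crawlee's public API):

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext

parsel_crawler = ParselCrawler()


@parsel_crawler.router.default_handler
async def handle(context: ParselCrawlingContext) -> None:
    # Extracted links go to a separate, alias-scoped queue rather than the
    # crawler's default request queue.
    await context.enqueue_links(rq_alias='product-pages', strategy='same-domain')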
@@ -974,8 +1075,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             return target_url.hostname == origin_url.hostname
 
         if strategy == 'same-domain':
-            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).
-            target_domain = self._tld_extractor.extract_str(target_url.hostname).
+            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).top_domain_under_public_suffix
+            target_domain = self._tld_extractor.extract_str(target_url.hostname).top_domain_under_public_suffix
             return origin_domain == target_domain
 
         if strategy == 'same-origin':
@@ -1031,8 +1132,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         if self._should_retry_request(context, error):
             request.retry_count += 1
+            reduced_error = str(error).split('\n')[0]
             self.log.warning(
-                f'Retrying request to {context.request.url} due to: {
+                f'Retrying request to {context.request.url} due to: {reduced_error}'
                 f'{get_one_line_error_summary_if_possible(error)}'
             )
             await self._statistics.error_tracker.add(error=error, context=context)
@@ -1043,21 +1145,16 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             except Exception as e:
                 raise UserDefinedErrorHandlerError('Exception thrown in user-defined request error handler') from e
             else:
-                if new_request is not None:
-
+                if new_request is not None and new_request != request:
+                    await request_manager.add_request(new_request)
+                    await self._mark_request_as_handled(request)
+                    return
 
             await request_manager.reclaim_request(request)
         else:
-            await
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            await self._mark_request_as_handled(request)
             await self._handle_failed_request(context, error)
-            self._statistics.record_request_processing_failure(request.
+            self._statistics.record_request_processing_failure(request.unique_key)
 
     async def _handle_request_error(self, context: TCrawlingContext | BasicCrawlingContext, error: Exception) -> None:
         try:
@@ -1104,16 +1201,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False
     ) -> None:
         if need_mark and isinstance(request, Request):
-
-
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            await self._mark_request_as_handled(request)
             request.state = RequestState.SKIPPED
 
         url = request.url if isinstance(request, Request) else request
@@ -1186,34 +1274,46 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         else:
             yield Request.from_url(url)
 
-    async def
-
-
-
-
-
-
-
-
-
-
-
-
+    async def _add_requests(
+        self,
+        context: BasicCrawlingContext,
+        requests: Sequence[str | Request],
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
+        **kwargs: Unpack[EnqueueLinksKwargs],
+    ) -> None:
+        """Add requests method aware of the crawling context."""
+        if rq_id or rq_name or rq_alias:
+            request_manager: RequestManager = await RequestQueue.open(
+                id=rq_id,
+                name=rq_name,
+                alias=rq_alias,
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )
+        else:
+            request_manager = await self.get_request_manager()
 
-
+        context_aware_requests = list[Request]()
+        base_url = kwargs.get('base_url') or context.request.loaded_url or context.request.url
+        requests_iterator = self._convert_url_to_request_iterator(requests, base_url)
+        filter_requests_iterator = self._enqueue_links_filter_iterator(requests_iterator, context.request.url, **kwargs)
+        for dst_request in filter_requests_iterator:
+            # Update the crawl depth of the request.
+            dst_request.crawl_depth = context.request.crawl_depth + 1
 
-
-
-            )
+            if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:
+                context_aware_requests.append(dst_request)
 
-
-            # Update the crawl depth of the request.
-            dst_request.crawl_depth = context.request.crawl_depth + 1
+        return await request_manager.add_requests(context_aware_requests)
 
-
-
+    async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
+        """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
+        result = self._context_result_map[context]
 
-
+        for add_requests_call in result.add_requests_calls:
+            await self._add_requests(context, **add_requests_call)
 
         for push_data_call in result.push_data_calls:
             await self._push_data(**push_data_call)
@@ -1225,8 +1325,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         result: RequestHandlerRunResult, get_kvs: GetKeyValueStoreFromRequestHandlerFunction
     ) -> None:
         """Store key value store changes recorded in result."""
-        for (id, name), changes in result.key_value_store_changes.items():
-            store = await get_kvs(id=id, name=name)
+        for (id, name, alias), changes in result.key_value_store_changes.items():
+            store = await get_kvs(id=id, name=name, alias=alias)
             for key, value in changes.updates.items():
                 await store.set_value(key, value.content, value.content_type)
 
@@ -1274,7 +1374,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         if not (await self._is_allowed_based_on_robots_txt_file(request.url)):
             self._logger.warning(
-                f'Skipping request {request.url} ({request.
+                f'Skipping request {request.url} ({request.unique_key}) because it is disallowed based on robots.txt'
             )
 
             await self._handle_skipped_request(request, 'robots_txt', need_mark=True)
@@ -1300,8 +1400,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         )
         self._context_result_map[context] = result
 
-
-        self._statistics.record_request_processing_start(statistics_id)
+        self._statistics.record_request_processing_start(request.unique_key)
 
         try:
             request.state = RequestState.REQUEST_HANDLER
@@ -1314,21 +1413,15 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                 raise RequestHandlerError(e, context) from e
 
             await self._commit_request_handler_result(context)
-
-
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+
+            await self._mark_request_as_handled(request)
 
             request.state = RequestState.DONE
 
             if context.session and context.session.is_usable:
                 context.session.mark_good()
 
-            self._statistics.record_request_processing_finish(
+            self._statistics.record_request_processing_finish(request.unique_key)
 
         except RequestCollisionError as request_error:
             context.request.no_retry = True
@@ -1364,29 +1457,15 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                 await request_manager.reclaim_request(request)
                 await self._statistics.error_tracker_retry.add(error=session_error, context=context)
             else:
-                await
-                    lambda: request_manager.mark_request_as_handled(context.request),
-                    timeout=self._internal_timeout,
-                    timeout_message='Marking request as handled timed out after '
-                    f'{self._internal_timeout.total_seconds()} seconds',
-                    logger=self._logger,
-                    max_retries=3,
-                )
+                await self._mark_request_as_handled(request)
 
                 await self._handle_failed_request(context, session_error)
-                self._statistics.record_request_processing_failure(
+                self._statistics.record_request_processing_failure(request.unique_key)
 
         except ContextPipelineInterruptedError as interrupted_error:
             self._logger.debug('The context pipeline was interrupted', exc_info=interrupted_error)
 
-            await
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            await self._mark_request_as_handled(request)
 
         except ContextPipelineInitializationError as initialization_error:
             self._logger.debug(
@@ -1404,12 +1483,15 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             raise
 
     async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
-        await
-
-
-
-
-
+        await self._context_pipeline(
+            context,
+            lambda final_context: wait_for(
+                lambda: self.router(final_context),
+                timeout=self._request_handler_timeout,
+                timeout_message=f'{self._request_handler_timeout_text}'
+                f' {self._request_handler_timeout.total_seconds()} seconds',
+                logger=self._logger,
+            ),
         )
 
     def _raise_for_error_status_code(self, status_code: int) -> None:
@@ -1520,7 +1602,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
     async def _crawler_state_task(self) -> None:
         """Emit a persist state event with the given migration status."""
-        event_manager =
+        event_manager = self._service_locator.get_event_manager()
 
         current_state = self.statistics.state
 
@@ -1557,3 +1639,14 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         )
 
         self._previous_crawler_state = current_state
+
+    async def _mark_request_as_handled(self, request: Request) -> None:
+        request_manager = await self.get_request_manager()
+        await wait_for(
+            lambda: request_manager.mark_request_as_handled(request),
+            timeout=self._internal_timeout,
+            timeout_message='Marking request as handled timed out after '
+            f'{self._internal_timeout.total_seconds()} seconds',
+            logger=self._logger,
+            max_retries=3,
+        )