crawlee 1.0.2b3 → 1.1.2b7 (py3-none-any.whl)
This diff compares the contents of publicly available package versions as they appear in their respective public registries and is provided for informational purposes only.
Potentially problematic release: this version of crawlee might be problematic.
- crawlee/_request.py +32 -21
- crawlee/_service_locator.py +4 -4
- crawlee/_types.py +30 -17
- crawlee/_utils/context.py +2 -2
- crawlee/_utils/file.py +7 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +17 -1
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +1 -1
- crawlee/_utils/time.py +41 -1
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +4 -1
- crawlee/browsers/_playwright_browser_controller.py +1 -1
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +3 -1
- crawlee/crawlers/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +47 -11
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +33 -13
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
- crawlee/crawlers/_basic/_basic_crawler.py +126 -112
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_playwright/_playwright_crawler.py +55 -11
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/events/_event_manager.py +4 -4
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +12 -0
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/otel/crawler_instrumentor.py +3 -3
- crawlee/request_loaders/_sitemap_request_loader.py +22 -4
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +32 -1
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +4 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +2 -2
- crawlee/storage_clients/_file_system/_key_value_store_client.py +3 -3
- crawlee/storage_clients/_file_system/_request_queue_client.py +27 -9
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +295 -0
- crawlee/storage_clients/_redis/_dataset_client.py +325 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
- crawlee/storage_clients/_redis/_storage_client.py +146 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/_db_models.py +1 -2
- crawlee/storage_clients/_sql/_key_value_store_client.py +3 -2
- crawlee/storage_clients/_sql/_request_queue_client.py +18 -4
- crawlee/storage_clients/_sql/_storage_client.py +1 -1
- crawlee/storages/_key_value_store.py +5 -2
- {crawlee-1.0.2b3.dist-info → crawlee-1.1.2b7.dist-info}/METADATA +8 -3
- {crawlee-1.0.2b3.dist-info → crawlee-1.1.2b7.dist-info}/RECORD +66 -54
- {crawlee-1.0.2b3.dist-info → crawlee-1.1.2b7.dist-info}/WHEEL +1 -1
- {crawlee-1.0.2b3.dist-info → crawlee-1.1.2b7.dist-info}/entry_points.txt +0 -0
- {crawlee-1.0.2b3.dist-info → crawlee-1.1.2b7.dist-info}/licenses/LICENSE +0 -0
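
The most substantial addition in the file list is a complete Redis-backed storage client package under `crawlee/storage_clients/_redis/`, including Lua scripts for atomic request-queue operations. The sketch below is speculative: the exported class name `RedisStorageClient` and its `connection_string` parameter are assumptions, not names taken from this diff; only the `service_locator.set_storage_client()` registration pattern is existing Crawlee API.

```python
# Hypothetical wiring of the new Redis storage client package.
# `RedisStorageClient` and `connection_string` are assumed names, not taken from the diff.
from crawlee import service_locator
from crawlee.storage_clients import RedisStorageClient  # assumed export

# Register the client globally so request queues, datasets and key-value stores
# opened by crawlers resolve to the Redis-backed implementations.
service_locator.set_storage_client(RedisStorageClient(connection_string='redis://localhost:6379/0'))
```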
crawlee/crawlers/_abstract_http/_abstract_http_crawler.py

@@ -3,14 +3,16 @@ from __future__ import annotations
 import asyncio
 import logging
 from abc import ABC
+from datetime import timedelta
 from typing import TYPE_CHECKING, Any, Generic
 
 from more_itertools import partition
 from pydantic import ValidationError
-from typing_extensions import TypeVar
+from typing_extensions import NotRequired, TypeVar
 
 from crawlee._request import Request, RequestOptions
 from crawlee._utils.docs import docs_group
+from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
 from crawlee.errors import SessionError
@@ -32,6 +34,19 @@ TCrawlingContext = TypeVar('TCrawlingContext', bound=ParsedHttpCrawlingContext)
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 
 
+class HttpCrawlerOptions(
+    BasicCrawlerOptions[TCrawlingContext, TStatisticsState],
+    Generic[TCrawlingContext, TStatisticsState],
+):
+    """Arguments for the `AbstractHttpCrawler` constructor.
+
+    It is intended for typing forwarded `__init__` arguments in the subclasses.
+    """
+
+    navigation_timeout: NotRequired[timedelta | None]
+    """Timeout for the HTTP request."""
+
+
 @docs_group('Crawlers')
 class AbstractHttpCrawler(
     BasicCrawler[TCrawlingContext, StatisticsState],
@@ -56,10 +71,13 @@ class AbstractHttpCrawler(
         self,
         *,
         parser: AbstractHttpParser[TParseResult, TSelectResult],
+        navigation_timeout: timedelta | None = None,
         **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]],
     ) -> None:
         self._parser = parser
+        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
         self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
+        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}
 
         if '_context_pipeline' not in kwargs:
             raise ValueError(
@@ -112,9 +130,17 @@ class AbstractHttpCrawler(
     async def _execute_pre_navigation_hooks(
         self, context: BasicCrawlingContext
     ) -> AsyncGenerator[BasicCrawlingContext, None]:
-
-
-
+        context_id = id(context)
+        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
+
+        try:
+            for hook in self._pre_navigation_hooks:
+                async with self._shared_navigation_timeouts[context_id]:
+                    await hook(context)
+
+            yield context
+        finally:
+            self._shared_navigation_timeouts.pop(context_id, None)
 
     async def _parse_http_response(
         self, context: HttpCrawlingContext
@@ -167,7 +193,15 @@ class AbstractHttpCrawler(
            kwargs.setdefault('strategy', 'same-hostname')
 
            links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
-
+
+           # Get base URL from <base> tag if present
+           extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]'))
+           base_url: str = (
+               str(extracted_base_urls[0])
+               if extracted_base_urls
+               else context.request.loaded_url or context.request.url
+           )
+           links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)
 
            if robots_txt_file:
                skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
@@ -214,12 +248,14 @@ class AbstractHttpCrawler(
         Yields:
             The original crawling context enhanced by HTTP response.
         """
-
-
-
-
-
-
+        async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+            result = await self._http_client.crawl(
+                request=context.request,
+                session=context.session,
+                proxy_info=context.proxy_info,
+                statistics=self._statistics,
+                timeout=remaining_timeout,
+            )
 
         yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response)
 
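The hunks above add a shared navigation timeout to HTTP-based crawlers: `navigation_timeout` (defaulting to one minute) becomes a `SharedTimeout` that covers both the pre-navigation hooks and the HTTP request itself, and `HttpCrawlerOptions` exposes it as a forwardable constructor argument. A minimal usage sketch follows, assuming concrete subclasses such as `ParselCrawler` forward `HttpCrawlerOptions` to `AbstractHttpCrawler` as the new docstring suggests.

```python
# Minimal sketch, assuming ParselCrawler forwards HttpCrawlerOptions
# (including the new `navigation_timeout`) to AbstractHttpCrawler.
import asyncio
from datetime import timedelta

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


async def main() -> None:
    # One shared budget for the pre-navigation hooks plus the HTTP request.
    crawler = ParselCrawler(navigation_timeout=timedelta(seconds=30))

    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        await context.push_data({'url': context.request.url})

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```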
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py

@@ -71,7 +71,6 @@ class _NonPersistentStatistics(Statistics):
     async def __aenter__(self) -> Self:
         self._active = True
         await self._state.initialize()
-        self._after_initialize()
         return self
 
     async def __aexit__(
@@ -149,10 +148,6 @@ class AdaptivePlaywrightCrawler(
                 non-default configuration.
             kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
         """
-        # Some sub crawler kwargs are internally modified. Prepare copies.
-        basic_crawler_kwargs_for_static_crawler = deepcopy(kwargs)
-        basic_crawler_kwargs_for_pw_crawler = deepcopy(kwargs)
-
         # Adaptive crawling related.
         self.rendering_type_predictor = rendering_type_predictor or DefaultRenderingTypePredictor()
         self.result_checker = result_checker or (lambda _: True)
@@ -170,11 +165,11 @@ class AdaptivePlaywrightCrawler(
         # Each sub crawler will use custom logger .
         static_logger = getLogger('Subcrawler_static')
         static_logger.setLevel(logging.ERROR)
-        basic_crawler_kwargs_for_static_crawler
+        basic_crawler_kwargs_for_static_crawler: _BasicCrawlerOptions = {'_logger': static_logger, **kwargs}
 
         pw_logger = getLogger('Subcrawler_playwright')
         pw_logger.setLevel(logging.ERROR)
-        basic_crawler_kwargs_for_pw_crawler
+        basic_crawler_kwargs_for_pw_crawler: _BasicCrawlerOptions = {'_logger': pw_logger, **kwargs}
 
         # Initialize sub crawlers to create their pipelines.
         static_crawler_class = AbstractHttpCrawler.create_parsed_http_crawler_class(static_parser=static_parser)
@@ -319,7 +314,7 @@ class AdaptivePlaywrightCrawler(
                 ),
                 logger=self._logger,
             )
-            return SubCrawlerRun(result=result)
+            return SubCrawlerRun(result=result, run_context=context_linked_to_result)
         except Exception as e:
             return SubCrawlerRun(exception=e)
 
@@ -375,7 +370,8 @@ class AdaptivePlaywrightCrawler(
            self.track_http_only_request_handler_runs()
 
            static_run = await self._crawl_one(rendering_type='static', context=context)
-           if static_run.result and self.result_checker(static_run.result):
+           if static_run.result and static_run.run_context and self.result_checker(static_run.result):
+               self._update_context_from_copy(context, static_run.run_context)
                self._context_result_map[context] = static_run.result
                return
            if static_run.exception:
@@ -406,13 +402,10 @@ class AdaptivePlaywrightCrawler(
        if pw_run.exception is not None:
            raise pw_run.exception
 
-       if pw_run.result:
-           self._context_result_map[context] = pw_run.result
-
+       if pw_run.result and pw_run.run_context:
           if should_detect_rendering_type:
               detection_result: RenderingType
               static_run = await self._crawl_one('static', context=context, state=old_state_copy)
-
               if static_run.result and self.result_comparator(static_run.result, pw_run.result):
                   detection_result = 'static'
               else:
@@ -421,6 +414,9 @@ class AdaptivePlaywrightCrawler(
               context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}')
               self.rendering_type_predictor.store_result(context.request, detection_result)
 
+          self._update_context_from_copy(context, pw_run.run_context)
+          self._context_result_map[context] = pw_run.result
+
     def pre_navigation_hook(
         self,
         hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]] | None = None,
@@ -455,8 +451,32 @@ class AdaptivePlaywrightCrawler(
     def track_rendering_type_mispredictions(self) -> None:
         self.statistics.state.rendering_type_mispredictions += 1
 
+    def _update_context_from_copy(self, context: BasicCrawlingContext, context_copy: BasicCrawlingContext) -> None:
+        """Update mutable fields of `context` from `context_copy`.
+
+        Uses object.__setattr__ to bypass frozen dataclass restrictions,
+        allowing state synchronization after isolated crawler execution.
+        """
+        updating_attributes = {
+            'request': ('headers', 'user_data'),
+            'session': ('_user_data', '_usage_count', '_error_score', '_cookies'),
+        }
+
+        for attr, sub_attrs in updating_attributes.items():
+            original_sub_obj = getattr(context, attr)
+            copy_sub_obj = getattr(context_copy, attr)
+
+            # Check that both sub objects are not None
+            if original_sub_obj is None or copy_sub_obj is None:
+                continue
+
+            for sub_attr in sub_attrs:
+                new_value = getattr(copy_sub_obj, sub_attr)
+                object.__setattr__(original_sub_obj, sub_attr, new_value)
+
 
 @dataclass(frozen=True)
 class SubCrawlerRun:
     result: RequestHandlerRunResult | None = None
     exception: Exception | None = None
+    run_context: BasicCrawlingContext | None = None
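The new `run_context` field on `SubCrawlerRun` and the `_update_context_from_copy` helper exist so that mutations made during an isolated sub-crawler run (request headers, user data, session state) can be copied back onto the original, frozen crawling context. The standalone snippet below only illustrates the underlying Python mechanism the helper relies on; it is not crawlee code.

```python
# Standalone illustration of the object.__setattr__ pattern used by
# _update_context_from_copy: a frozen dataclass rejects normal assignment,
# but writing through object.__setattr__ bypasses the generated __setattr__.
from dataclasses import FrozenInstanceError, dataclass, field


@dataclass(frozen=True)
class Context:
    url: str
    user_data: dict = field(default_factory=dict)


original = Context(url='https://example.com')
updated_copy = Context(url='https://example.com', user_data={'seen': True})

try:
    original.user_data = updated_copy.user_data  # type: ignore[misc]
except FrozenInstanceError as exc:
    print(f'direct assignment fails: {exc}')

# The bypass: copy the mutated field back onto the frozen instance.
object.__setattr__(original, 'user_data', updated_copy.user_data)
print(original.user_data)  # {'seen': True}
```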
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py

@@ -17,7 +17,7 @@ if TYPE_CHECKING:
     from playwright.async_api import Page, Response
     from typing_extensions import Self
 
-    from crawlee.crawlers._playwright._types import BlockRequestsFunction
+    from crawlee.crawlers._playwright._types import BlockRequestsFunction, GotoOptions
 
 
 TStaticParseResult = TypeVar('TStaticParseResult')
@@ -190,8 +190,9 @@ class AdaptivePlaywrightCrawlingContext(
         http_response = await PlaywrightHttpResponse.from_playwright_response(
             response=context.response, protocol=protocol_guess or ''
         )
-        # block_requests
+        # block_requests and goto_options are useful only on pre-navigation contexts. It is useless here.
         context_kwargs.pop('block_requests')
+        context_kwargs.pop('goto_options')
         return cls(
             parsed_content=await parser.parse(http_response),
             http_response=http_response,
@@ -212,6 +213,9 @@ class AdaptivePlaywrightPreNavCrawlingContext(BasicCrawlingContext):
     block_requests: BlockRequestsFunction | None = None
     """Blocks network requests matching specified URL patterns."""
 
+    goto_options: GotoOptions | None = None
+    """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""
+
     @property
     def page(self) -> Page:
         """The Playwright `Page` object for the current page.
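`GotoOptions` appears here as the type of the new `goto_options` field, whose docstring says the options are forwarded to Playwright's `Page.goto()` with `timeout` excluded. For reference, the snippet below shows the plain Playwright call these options ultimately reach; it is ordinary Playwright usage, not crawlee code, and which exact keys `GotoOptions` accepts is not visible in this diff.

```python
# Plain Playwright illustration of the Page.goto() keyword arguments that a
# goto_options mapping would presumably carry through (timeout excluded).
import asyncio

from playwright.async_api import async_playwright


async def main() -> None:
    async with async_playwright() as pw:
        browser = await pw.chromium.launch()
        page = await browser.new_page()
        await page.goto('https://crawlee.dev', wait_until='networkidle', referer='https://example.com')
        print(await page.title())
        await browser.close()


if __name__ == '__main__':
    asyncio.run(main())
```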
crawlee/crawlers/_basic/_basic_crawler.py

@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import asyncio
+import functools
 import logging
 import signal
 import sys
@@ -13,8 +14,9 @@ from collections.abc import AsyncGenerator, Awaitable, Callable, Iterable, Sequence
 from contextlib import AsyncExitStack, suppress
 from datetime import timedelta
 from functools import partial
+from io import StringIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generic, Literal, cast
+from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast
 from urllib.parse import ParseResult, urlparse
 from weakref import WeakKeyDictionary
 
@@ -31,6 +33,8 @@ from crawlee._service_locator import ServiceLocator
 from crawlee._types import (
     BasicCrawlingContext,
     EnqueueLinksKwargs,
+    ExportDataCsvKwargs,
+    ExportDataJsonKwargs,
     GetKeyValueStoreFromRequestHandlerFunction,
     HttpHeaders,
     HttpPayload,
@@ -40,7 +44,7 @@ from crawlee._types import (
     SkippedReason,
 )
 from crawlee._utils.docs import docs_group
-from crawlee._utils.file import export_csv_to_stream, export_json_to_stream
+from crawlee._utils.file import atomic_write, export_csv_to_stream, export_json_to_stream
 from crawlee._utils.recurring_task import RecurringTask
 from crawlee._utils.robots import RobotsTxtFile
 from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
@@ -96,6 +100,9 @@ if TYPE_CHECKING:
 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 TRequestIterator = TypeVar('TRequestIterator', str, Request)
+TParams = ParamSpec('TParams')
+T = TypeVar('T')
+
 ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Request | None]]
 FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]]
 SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]
@@ -437,14 +444,23 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._statistics_log_format = statistics_log_format
 
         # Statistics
-
-
-
-
-
-
-
-
+        if statistics:
+            self._statistics = statistics
+        else:
+
+            async def persist_state_factory() -> KeyValueStore:
+                return await self.get_key_value_store()
+
+            self._statistics = cast(
+                'Statistics[TStatisticsState]',
+                Statistics.with_default_state(
+                    persistence_enabled=True,
+                    periodic_message_logger=self._logger,
+                    statistics_log_format=self._statistics_log_format,
+                    log_message='Current request statistics:',
+                    persist_state_kvs_factory=persist_state_factory,
+                ),
+            )
 
         # Additional context managers to enter and exit
         self._additional_context_managers = _additional_context_managers or []
@@ -511,6 +527,24 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._logger.info(f'Crawler.stop() was called with following reason: {reason}.')
         self._unexpected_stop = True
 
+    def _wrap_handler_with_error_context(
+        self, handler: Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]
+    ) -> Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]:
+        """Decorate error handlers to make their context helpers usable."""
+
+        @functools.wraps(handler)
+        async def wrapped_handler(context: TCrawlingContext | BasicCrawlingContext, exception: Exception) -> T:
+            # Original context helpers that are from `RequestHandlerRunResult` will not be commited as the request
+            # failed. Modified context provides context helpers with direct access to the storages.
+            error_context = context.create_modified_copy(
+                push_data=self._push_data,
+                get_key_value_store=self.get_key_value_store,
+                add_requests=functools.partial(self._add_requests, context),
+            )
+            return await handler(error_context, exception)
+
+        return wrapped_handler
+
     def _stop_if_max_requests_count_exceeded(self) -> None:
         """Call `stop` when the maximum number of requests to crawl has been reached."""
         if self._max_requests_per_crawl is None:
@@ -609,7 +643,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         The error handler is invoked after a request handler error occurs and before a retry attempt.
         """
-        self._error_handler = handler
+        self._error_handler = self._wrap_handler_with_error_context(handler)
         return handler
 
     def failed_request_handler(
@@ -619,7 +653,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         The failed request handler is invoked when a request has failed all retry attempts.
         """
-        self._failed_request_handler = handler
+        self._failed_request_handler = self._wrap_handler_with_error_context(handler)
         return handler
 
     def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequestCallback:
@@ -689,7 +723,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             except CancelledError:
                 pass
             finally:
-                await self._crawler_state_rec_task.stop()
                 if threading.current_thread() is threading.main_thread():
                     with suppress(NotImplementedError):
                         asyncio.get_running_loop().remove_signal_handler(signal.SIGINT)
@@ -721,8 +754,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
     async def _run_crawler(self) -> None:
         event_manager = self._service_locator.get_event_manager()
 
-        self._crawler_state_rec_task.start()
-
         # Collect the context managers to be entered. Context managers that are already active are excluded,
         # as they were likely entered by the caller, who will also be responsible for exiting them.
         contexts_to_enter = [
@@ -733,6 +764,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                 self._statistics,
                 self._session_pool if self._use_session_pool else None,
                 self._http_client,
+                self._crawler_state_rec_task,
                 *self._additional_context_managers,
             )
             if cm and getattr(cm, 'active', False) is False
@@ -839,6 +871,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         dataset_id: str | None = None,
         dataset_name: str | None = None,
         dataset_alias: str | None = None,
+        **additional_kwargs: Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs],  # type: ignore[misc]
     ) -> None:
         """Export all items from a Dataset to a JSON or CSV file.
 
@@ -851,6 +884,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             dataset_id: The ID of the Dataset to export from.
             dataset_name: The name of the Dataset to export from (global scope, named storage).
             dataset_alias: The alias of the Dataset to export from (run scope, unnamed storage).
+            additional_kwargs: Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format.
         """
         dataset = await Dataset.open(
             id=dataset_id,
@@ -860,13 +894,18 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             configuration=self._service_locator.get_configuration(),
         )
 
-        path =
-        dst = path.open('w', newline='')
+        path = Path(path)
 
         if path.suffix == '.csv':
-
+            dst = StringIO()
+            csv_kwargs = cast('ExportDataCsvKwargs', additional_kwargs)
+            await export_csv_to_stream(dataset.iterate_items(), dst, **csv_kwargs)
+            await atomic_write(path, dst.getvalue())
         elif path.suffix == '.json':
-
+            dst = StringIO()
+            json_kwargs = cast('ExportDataJsonKwargs', additional_kwargs)
+            await export_json_to_stream(dataset.iterate_items(), dst, **json_kwargs)
+            await atomic_write(path, dst.getvalue())
         else:
             raise ValueError(f'Unsupported file extension: {path.suffix}')
 
@@ -972,6 +1011,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                 label=label,
                 user_data=user_data,
                 transform_request_function=transform_request_function,
+                **kwargs,
             ),
             rq_id=rq_id,
             rq_name=rq_name,
@@ -1035,8 +1075,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             return target_url.hostname == origin_url.hostname
 
         if strategy == 'same-domain':
-            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).
-            target_domain = self._tld_extractor.extract_str(target_url.hostname).
+            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).top_domain_under_public_suffix
+            target_domain = self._tld_extractor.extract_str(target_url.hostname).top_domain_under_public_suffix
             return origin_domain == target_domain
 
         if strategy == 'same-origin':
@@ -1105,19 +1145,14 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
            except Exception as e:
                raise UserDefinedErrorHandlerError('Exception thrown in user-defined request error handler') from e
            else:
-               if new_request is not None:
-
+               if new_request is not None and new_request != request:
+                   await request_manager.add_request(new_request)
+                   await self._mark_request_as_handled(request)
+                   return
 
                await request_manager.reclaim_request(request)
         else:
-            await
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            await self._mark_request_as_handled(request)
             await self._handle_failed_request(context, error)
             self._statistics.record_request_processing_failure(request.unique_key)
 
@@ -1166,16 +1201,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False
     ) -> None:
         if need_mark and isinstance(request, Request):
-
-
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            await self._mark_request_as_handled(request)
             request.state = RequestState.SKIPPED
 
         url = request.url if isinstance(request, Request) else request
@@ -1248,52 +1274,46 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             else:
                 yield Request.from_url(url)
 
-    async def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                storage_client=self._service_locator.get_storage_client(),
-                configuration=self._service_locator.get_configuration(),
-            )
-        else:
-            request_manager = base_request_manager
-
-        requests = list[Request]()
-
-        base_url = url if (url := add_requests_call.get('base_url')) else origin
-
-        requests_iterator = self._convert_url_to_request_iterator(add_requests_call['requests'], base_url)
+    async def _add_requests(
+        self,
+        context: BasicCrawlingContext,
+        requests: Sequence[str | Request],
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
+        **kwargs: Unpack[EnqueueLinksKwargs],
+    ) -> None:
+        """Add requests method aware of the crawling context."""
+        if rq_id or rq_name or rq_alias:
+            request_manager: RequestManager = await RequestQueue.open(
+                id=rq_id,
+                name=rq_name,
+                alias=rq_alias,
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )
+        else:
+            request_manager = await self.get_request_manager()
 
-
+        context_aware_requests = list[Request]()
+        base_url = kwargs.get('base_url') or context.request.loaded_url or context.request.url
+        requests_iterator = self._convert_url_to_request_iterator(requests, base_url)
+        filter_requests_iterator = self._enqueue_links_filter_iterator(requests_iterator, context.request.url, **kwargs)
+        for dst_request in filter_requests_iterator:
+            # Update the crawl depth of the request.
+            dst_request.crawl_depth = context.request.crawl_depth + 1
 
-
-
-            )
+            if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:
+                context_aware_requests.append(dst_request)
 
-
-            # Update the crawl depth of the request.
-            dst_request.crawl_depth = context.request.crawl_depth + 1
+        return await request_manager.add_requests(context_aware_requests)
 
-
-
+    async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
+        """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
+        result = self._context_result_map[context]
 
-
+        for add_requests_call in result.add_requests_calls:
+            await self._add_requests(context, **add_requests_call)
 
         for push_data_call in result.push_data_calls:
             await self._push_data(**push_data_call)
@@ -1393,14 +1413,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             raise RequestHandlerError(e, context) from e
 
         await self._commit_request_handler_result(context)
-
-
-            timeout=self._internal_timeout,
-            timeout_message='Marking request as handled timed out after '
-            f'{self._internal_timeout.total_seconds()} seconds',
-            logger=self._logger,
-            max_retries=3,
-        )
+
+        await self._mark_request_as_handled(request)
 
         request.state = RequestState.DONE
 
@@ -1443,14 +1457,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             await request_manager.reclaim_request(request)
             await self._statistics.error_tracker_retry.add(error=session_error, context=context)
         else:
-            await
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            await self._mark_request_as_handled(request)
 
             await self._handle_failed_request(context, session_error)
             self._statistics.record_request_processing_failure(request.unique_key)
@@ -1458,14 +1465,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         except ContextPipelineInterruptedError as interrupted_error:
             self._logger.debug('The context pipeline was interrupted', exc_info=interrupted_error)
 
-            await
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            await self._mark_request_as_handled(request)
 
         except ContextPipelineInitializationError as initialization_error:
             self._logger.debug(
@@ -1483,12 +1483,15 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             raise
 
     async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
-        await
-
-
-
-
-
+        await self._context_pipeline(
+            context,
+            lambda final_context: wait_for(
+                lambda: self.router(final_context),
+                timeout=self._request_handler_timeout,
+                timeout_message=f'{self._request_handler_timeout_text}'
+                f' {self._request_handler_timeout.total_seconds()} seconds',
+                logger=self._logger,
+            ),
         )
 
     def _raise_for_error_status_code(self, status_code: int) -> None:
@@ -1636,3 +1639,14 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             )
 
             self._previous_crawler_state = current_state
+
+    async def _mark_request_as_handled(self, request: Request) -> None:
+        request_manager = await self.get_request_manager()
+        await wait_for(
+            lambda: request_manager.mark_request_as_handled(request),
+            timeout=self._internal_timeout,
+            timeout_message='Marking request as handled timed out after '
+            f'{self._internal_timeout.total_seconds()} seconds',
+            logger=self._logger,
+            max_retries=3,
+        )
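Two behavioural changes in `_basic_crawler.py` are worth calling out: error and failed-request handlers are now wrapped so that their context helpers (`push_data`, `get_key_value_store`, `add_requests`) write directly to storage even though the failed request's own handler result is never committed, and `export_data()` now forwards extra keyword arguments to the CSV/JSON exporters while writing the file through `atomic_write`. A minimal sketch follows; treat the `delimiter` keyword as an assumption about what `ExportDataCsvKwargs` accepts.

```python
# Minimal sketch of the new error-handling and export behaviour.
import asyncio

from crawlee.crawlers import BasicCrawler, BasicCrawlingContext


async def main() -> None:
    crawler = BasicCrawler(max_request_retries=1)

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        raise RuntimeError('simulated handler failure')

    @crawler.failed_request_handler
    async def failed(context: BasicCrawlingContext, error: Exception) -> None:
        # With the wrapped handler, this push_data() writes straight to the
        # dataset even though the request itself failed.
        await context.push_data({'url': context.request.url, 'error': str(error)})

    await crawler.run(['https://crawlee.dev'])

    # `delimiter` assumes csv.writer-style kwargs are part of ExportDataCsvKwargs.
    await crawler.export_data('failures.csv', delimiter=';')


if __name__ == '__main__':
    asyncio.run(main())
```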