crawlee 1.1.1b1__py3-none-any.whl → 1.1.2b4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlee might be problematic.
- crawlee/_types.py +20 -1
- crawlee/_utils/file.py +7 -0
- crawlee/_utils/time.py +41 -1
- crawlee/crawlers/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +46 -12
- crawlee/crawlers/_basic/_basic_crawler.py +107 -101
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_playwright/_playwright_crawler.py +40 -10
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +12 -0
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/storage_clients/_file_system/_dataset_client.py +2 -2
- crawlee/storage_clients/_file_system/_key_value_store_client.py +3 -3
- crawlee/storage_clients/_file_system/_request_queue_client.py +3 -3
- crawlee/storage_clients/_sql/_storage_client.py +0 -9
- {crawlee-1.1.1b1.dist-info → crawlee-1.1.2b4.dist-info}/METADATA +4 -3
- {crawlee-1.1.1b1.dist-info → crawlee-1.1.2b4.dist-info}/RECORD +24 -24
- {crawlee-1.1.1b1.dist-info → crawlee-1.1.2b4.dist-info}/WHEEL +1 -1
- {crawlee-1.1.1b1.dist-info → crawlee-1.1.2b4.dist-info}/entry_points.txt +0 -0
- {crawlee-1.1.1b1.dist-info → crawlee-1.1.2b4.dist-info}/licenses/LICENSE +0 -0
crawlee/_types.py
CHANGED
@@ -15,7 +15,7 @@ if TYPE_CHECKING:
     import re
     from collections.abc import Callable, Coroutine, Sequence

-    from typing_extensions import NotRequired, Required, Unpack
+    from typing_extensions import NotRequired, Required, Self, Unpack

     from crawlee import Glob, Request
     from crawlee._request import RequestOptions
@@ -643,6 +643,25 @@ class BasicCrawlingContext:
         """Return hash of the context. Each context is considered unique."""
         return id(self)

+    def create_modified_copy(
+        self,
+        push_data: PushDataFunction | None = None,
+        add_requests: AddRequestsFunction | None = None,
+        get_key_value_store: GetKeyValueStoreFromRequestHandlerFunction | None = None,
+    ) -> Self:
+        """Create a modified copy of the crawling context with specified changes."""
+        original_fields = {field.name: getattr(self, field.name) for field in dataclasses.fields(self)}
+        modified_fields = {
+            key: value
+            for key, value in {
+                'push_data': push_data,
+                'add_requests': add_requests,
+                'get_key_value_store': get_key_value_store,
+            }.items()
+            if value
+        }
+        return self.__class__(**{**original_fields, **modified_fields})
+

 class GetDataKwargs(TypedDict):
     """Keyword arguments for dataset's `get_data` method."""
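The new create_modified_copy helper copies every dataclass field of the context and overrides only the helpers that are explicitly passed in (None means "keep the original"). A standalone toy sketch of the same copy-and-override pattern, for illustration only (ToyContext is hypothetical, not crawlee code):

from __future__ import annotations

import dataclasses
from collections.abc import Callable


@dataclasses.dataclass(frozen=True)
class ToyContext:
    url: str
    push_data: Callable[[dict], None]

    def create_modified_copy(self, push_data: Callable[[dict], None] | None = None) -> ToyContext:
        # Copy all existing fields, then overwrite only the helpers that were provided.
        original = {f.name: getattr(self, f.name) for f in dataclasses.fields(self)}
        overrides = {k: v for k, v in {'push_data': push_data}.items() if v}
        return self.__class__(**{**original, **overrides})


ctx = ToyContext('https://example.com', push_data=print)
copy = ctx.create_modified_copy(push_data=lambda item: print('direct to storage:', item))
assert copy.url == ctx.url  # untouched fields carry over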
crawlee/_utils/file.py
CHANGED
@@ -163,6 +163,13 @@ async def export_csv_to_stream(
     dst: TextIO,
     **kwargs: Unpack[ExportDataCsvKwargs],
 ) -> None:
+    # Set lineterminator to '\n' if not explicitly provided. This prevents double line endings on Windows.
+    # The csv.writer default is '\r\n', which when written to a file in text mode on Windows gets converted
+    # to '\r\r\n' due to newline translation. By using '\n', we let the platform handle the line ending
+    # conversion: '\n' stays as '\n' on Unix, and becomes '\r\n' on Windows.
+    if 'lineterminator' not in kwargs:
+        kwargs['lineterminator'] = '\n'
+
     writer = csv.writer(dst, **kwargs)  # type: ignore[arg-type]
     write_header = True
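For context on the default that the comment above describes, a minimal stdlib-only demonstration (example code, not part of the diff) of why csv.writer emits '\r\n' unless told otherwise:

import csv
from io import StringIO

# csv.writer's default lineterminator is '\r\n'; if this buffer were written through a text-mode file
# on Windows, the trailing '\n' would be translated again, producing '\r\r\n'.
buf = StringIO()
csv.writer(buf).writerow(['url', 'title'])
assert buf.getvalue() == 'url,title\r\n'

# With lineterminator='\n', the platform's newline translation does the right thing on its own.
buf = StringIO()
csv.writer(buf, lineterminator='\n').writerow(['url', 'title'])
assert buf.getvalue() == 'url,title\n'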
|
crawlee/_utils/time.py
CHANGED
@@ -3,11 +3,14 @@ from __future__ import annotations
 import time
 from contextlib import contextmanager
 from dataclasses import dataclass
+from datetime import timedelta
 from typing import TYPE_CHECKING

+from async_timeout import Timeout, timeout
+
 if TYPE_CHECKING:
     from collections.abc import Iterator
-    from datetime import timedelta
+    from types import TracebackType

 _SECONDS_PER_MINUTE = 60
 _SECONDS_PER_HOUR = 3600
@@ -35,6 +38,43 @@ def measure_time() -> Iterator[TimerResult]:
         result.cpu = after_cpu - before_cpu


+class SharedTimeout:
+    """Keeps track of a time budget shared by multiple independent async operations.
+
+    Provides a reusable, non-reentrant context manager interface.
+    """
+
+    def __init__(self, timeout: timedelta) -> None:
+        self._remaining_timeout = timeout
+        self._active_timeout: Timeout | None = None
+        self._activation_timestamp: float | None = None
+
+    async def __aenter__(self) -> timedelta:
+        if self._active_timeout is not None or self._activation_timestamp is not None:
+            raise RuntimeError('A shared timeout context cannot be entered twice at the same time')
+
+        self._activation_timestamp = time.monotonic()
+        self._active_timeout = new_timeout = timeout(self._remaining_timeout.total_seconds())
+        await new_timeout.__aenter__()
+        return self._remaining_timeout
+
+    async def __aexit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
+    ) -> None:
+        if self._active_timeout is None or self._activation_timestamp is None:
+            raise RuntimeError('Logic error')
+
+        await self._active_timeout.__aexit__(exc_type, exc_value, exc_traceback)
+        elapsed = time.monotonic() - self._activation_timestamp
+        self._remaining_timeout = self._remaining_timeout - timedelta(seconds=elapsed)
+
+        self._active_timeout = None
+        self._activation_timestamp = None
+
+
 def format_duration(duration: timedelta | None) -> str:
     """Format a timedelta into a human-readable string with appropriate units."""
     if duration is None:
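A rough usage sketch of the new internal helper (import path taken from this diff; the timings are illustrative): sequential async with blocks draw from one shared budget, so a later entry only gets whatever time the earlier ones left over, and overrunning the budget cancels the block with a timeout error.

import asyncio
from datetime import timedelta

from crawlee._utils.time import SharedTimeout


async def main() -> None:
    budget = SharedTimeout(timedelta(seconds=5))

    async with budget:
        # Roughly 5 s available for this block.
        await asyncio.sleep(1)

    async with budget as remaining:  # re-entered later, never concurrently
        # Roughly 4 s of the budget remain here.
        print(f'{remaining.total_seconds():.1f} s left')
        await asyncio.sleep(1)


asyncio.run(main())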
crawlee/crawlers/__init__.py
CHANGED
@@ -1,7 +1,7 @@
 from crawlee._utils.try_import import install_import_hook as _install_import_hook
 from crawlee._utils.try_import import try_import as _try_import

-from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, ParsedHttpCrawlingContext
+from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, HttpCrawlerOptions, ParsedHttpCrawlingContext
 from ._basic import BasicCrawler, BasicCrawlerOptions, BasicCrawlingContext, ContextPipeline
 from ._http import HttpCrawler, HttpCrawlingContext, HttpCrawlingResult

@@ -51,6 +51,7 @@ __all__ = [
     'BeautifulSoupParserType',
     'ContextPipeline',
     'HttpCrawler',
+    'HttpCrawlerOptions',
     'HttpCrawlingContext',
     'HttpCrawlingResult',
     'ParsedHttpCrawlingContext',
crawlee/crawlers/_abstract_http/__init__.py
CHANGED
@@ -1,9 +1,10 @@
-from ._abstract_http_crawler import AbstractHttpCrawler
+from ._abstract_http_crawler import AbstractHttpCrawler, HttpCrawlerOptions
 from ._abstract_http_parser import AbstractHttpParser
 from ._http_crawling_context import ParsedHttpCrawlingContext

 __all__ = [
     'AbstractHttpCrawler',
     'AbstractHttpParser',
+    'HttpCrawlerOptions',
     'ParsedHttpCrawlingContext',
 ]
crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
CHANGED
@@ -3,14 +3,16 @@ from __future__ import annotations
 import asyncio
 import logging
 from abc import ABC
+from datetime import timedelta
 from typing import TYPE_CHECKING, Any, Generic

 from more_itertools import partition
 from pydantic import ValidationError
-from typing_extensions import TypeVar
+from typing_extensions import NotRequired, TypeVar

 from crawlee._request import Request, RequestOptions
 from crawlee._utils.docs import docs_group
+from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
 from crawlee.errors import SessionError
@@ -32,6 +34,19 @@ TCrawlingContext = TypeVar('TCrawlingContext', bound=ParsedHttpCrawlingContext)
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)


+class HttpCrawlerOptions(
+    BasicCrawlerOptions[TCrawlingContext, TStatisticsState],
+    Generic[TCrawlingContext, TStatisticsState],
+):
+    """Arguments for the `AbstractHttpCrawler` constructor.
+
+    It is intended for typing forwarded `__init__` arguments in the subclasses.
+    """
+
+    navigation_timeout: NotRequired[timedelta | None]
+    """Timeout for the HTTP request."""
+
+
 @docs_group('Crawlers')
 class AbstractHttpCrawler(
     BasicCrawler[TCrawlingContext, StatisticsState],
@@ -56,10 +71,13 @@ class AbstractHttpCrawler(
         self,
         *,
         parser: AbstractHttpParser[TParseResult, TSelectResult],
+        navigation_timeout: timedelta | None = None,
         **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]],
     ) -> None:
         self._parser = parser
+        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
         self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
+        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}

         if '_context_pipeline' not in kwargs:
             raise ValueError(
@@ -112,9 +130,17 @@ class AbstractHttpCrawler(
     async def _execute_pre_navigation_hooks(
         self, context: BasicCrawlingContext
     ) -> AsyncGenerator[BasicCrawlingContext, None]:
-
-
-
+        context_id = id(context)
+        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
+
+        try:
+            for hook in self._pre_navigation_hooks:
+                async with self._shared_navigation_timeouts[context_id]:
+                    await hook(context)
+
+            yield context
+        finally:
+            self._shared_navigation_timeouts.pop(context_id, None)

     async def _parse_http_response(
         self, context: HttpCrawlingContext
@@ -167,9 +193,15 @@ class AbstractHttpCrawler(
         kwargs.setdefault('strategy', 'same-hostname')

         links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
-
-
+
+        # Get base URL from <base> tag if present
+        extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]'))
+        base_url: str = (
+            str(extracted_base_urls[0])
+            if extracted_base_urls
+            else context.request.loaded_url or context.request.url
         )
+        links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)

         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
@@ -216,12 +248,14 @@ class AbstractHttpCrawler(
         Yields:
             The original crawling context enhanced by HTTP response.
         """
-
-
-
-
-
-
+        async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+            result = await self._http_client.crawl(
+                request=context.request,
+                session=context.session,
+                proxy_info=context.proxy_info,
+                statistics=self._statistics,
+                timeout=remaining_timeout,
+            )

         yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response)
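A hedged sketch of the new navigation_timeout option in a concrete subclass (URL and timings are placeholders): the pre-navigation hooks and the HTTP request itself now share a single per-request time budget via SharedTimeout.

import asyncio
from datetime import timedelta

from crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext


async def main() -> None:
    crawler = ParselCrawler(navigation_timeout=timedelta(seconds=10))

    @crawler.pre_navigation_hook
    async def hook(context: BasicCrawlingContext) -> None:
        # Whatever time this hook consumes is subtracted from the 10 s left for the HTTP request.
        context.log.info('preparing request')

    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        context.log.info(f'title: {context.selector.css("title::text").get()}')

    await crawler.run(['https://crawlee.dev'])


asyncio.run(main())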
crawlee/crawlers/_basic/_basic_crawler.py
CHANGED
@@ -2,6 +2,7 @@
 from __future__ import annotations

 import asyncio
+import functools
 import logging
 import signal
 import sys
@@ -13,8 +14,9 @@ from collections.abc import AsyncGenerator, Awaitable, Callable, Iterable, Sequence
 from contextlib import AsyncExitStack, suppress
 from datetime import timedelta
 from functools import partial
+from io import StringIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generic, Literal, cast
+from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast
 from urllib.parse import ParseResult, urlparse
 from weakref import WeakKeyDictionary

@@ -31,6 +33,8 @@ from crawlee._service_locator import ServiceLocator
 from crawlee._types import (
     BasicCrawlingContext,
     EnqueueLinksKwargs,
+    ExportDataCsvKwargs,
+    ExportDataJsonKwargs,
     GetKeyValueStoreFromRequestHandlerFunction,
     HttpHeaders,
     HttpPayload,
@@ -40,7 +44,7 @@ from crawlee._types import (
     SkippedReason,
 )
 from crawlee._utils.docs import docs_group
-from crawlee._utils.file import export_csv_to_stream, export_json_to_stream
+from crawlee._utils.file import atomic_write, export_csv_to_stream, export_json_to_stream
 from crawlee._utils.recurring_task import RecurringTask
 from crawlee._utils.robots import RobotsTxtFile
 from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
@@ -96,6 +100,9 @@ if TYPE_CHECKING:
 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 TRequestIterator = TypeVar('TRequestIterator', str, Request)
+TParams = ParamSpec('TParams')
+T = TypeVar('T')
+
 ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Request | None]]
 FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]]
 SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]
@@ -520,6 +527,24 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._logger.info(f'Crawler.stop() was called with following reason: {reason}.')
         self._unexpected_stop = True

+    def _wrap_handler_with_error_context(
+        self, handler: Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]
+    ) -> Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]:
+        """Decorate error handlers to make their context helpers usable."""
+
+        @functools.wraps(handler)
+        async def wrapped_handler(context: TCrawlingContext | BasicCrawlingContext, exception: Exception) -> T:
+            # Original context helpers that are from `RequestHandlerRunResult` will not be committed as the request
+            # failed. Modified context provides context helpers with direct access to the storages.
+            error_context = context.create_modified_copy(
+                push_data=self._push_data,
+                get_key_value_store=self.get_key_value_store,
+                add_requests=functools.partial(self._add_requests, context),
+            )
+            return await handler(error_context, exception)
+
+        return wrapped_handler
+
     def _stop_if_max_requests_count_exceeded(self) -> None:
         """Call `stop` when the maximum number of requests to crawl has been reached."""
         if self._max_requests_per_crawl is None:
@@ -618,7 +643,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

         The error handler is invoked after a request handler error occurs and before a retry attempt.
         """
-        self._error_handler = handler
+        self._error_handler = self._wrap_handler_with_error_context(handler)
         return handler

     def failed_request_handler(
@@ -628,7 +653,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

         The failed request handler is invoked when a request has failed all retry attempts.
         """
-        self._failed_request_handler = handler
+        self._failed_request_handler = self._wrap_handler_with_error_context(handler)
         return handler

     def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequestCallback:
@@ -846,6 +871,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         dataset_id: str | None = None,
         dataset_name: str | None = None,
         dataset_alias: str | None = None,
+        **additional_kwargs: Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs],  # type: ignore[misc]
     ) -> None:
         """Export all items from a Dataset to a JSON or CSV file.

@@ -858,6 +884,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             dataset_id: The ID of the Dataset to export from.
             dataset_name: The name of the Dataset to export from (global scope, named storage).
             dataset_alias: The alias of the Dataset to export from (run scope, unnamed storage).
+            additional_kwargs: Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format.
         """
         dataset = await Dataset.open(
             id=dataset_id,
@@ -867,13 +894,18 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             configuration=self._service_locator.get_configuration(),
         )

-        path =
-        dst = path.open('w', newline='')
+        path = Path(path)

         if path.suffix == '.csv':
-
+            dst = StringIO()
+            csv_kwargs = cast('ExportDataCsvKwargs', additional_kwargs)
+            await export_csv_to_stream(dataset.iterate_items(), dst, **csv_kwargs)
+            await atomic_write(path, dst.getvalue())
         elif path.suffix == '.json':
-
+            dst = StringIO()
+            json_kwargs = cast('ExportDataJsonKwargs', additional_kwargs)
+            await export_json_to_stream(dataset.iterate_items(), dst, **json_kwargs)
+            await atomic_write(path, dst.getvalue())
         else:
             raise ValueError(f'Unsupported file extension: {path.suffix}')

@@ -1043,8 +1075,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             return target_url.hostname == origin_url.hostname

         if strategy == 'same-domain':
-            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).
-            target_domain = self._tld_extractor.extract_str(target_url.hostname).
+            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).top_domain_under_public_suffix
+            target_domain = self._tld_extractor.extract_str(target_url.hostname).top_domain_under_public_suffix
             return origin_domain == target_domain

         if strategy == 'same-origin':
@@ -1113,19 +1145,14 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             except Exception as e:
                 raise UserDefinedErrorHandlerError('Exception thrown in user-defined request error handler') from e
             else:
-                if new_request is not None:
-
+                if new_request is not None and new_request != request:
+                    await request_manager.add_request(new_request)
+                    await self._mark_request_as_handled(request)
+                    return

             await request_manager.reclaim_request(request)
         else:
-            await
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            await self._mark_request_as_handled(request)
             await self._handle_failed_request(context, error)
             self._statistics.record_request_processing_failure(request.unique_key)

@@ -1174,16 +1201,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False
     ) -> None:
         if need_mark and isinstance(request, Request):
-
-
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            await self._mark_request_as_handled(request)
             request.state = RequestState.SKIPPED

         url = request.url if isinstance(request, Request) else request
@@ -1256,52 +1274,46 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         else:
             yield Request.from_url(url)

-    async def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    storage_client=self._service_locator.get_storage_client(),
-                    configuration=self._service_locator.get_configuration(),
-                )
-            else:
-                request_manager = base_request_manager
-
-            requests = list[Request]()
-
-            base_url = url if (url := add_requests_call.get('base_url')) else origin
-
-            requests_iterator = self._convert_url_to_request_iterator(add_requests_call['requests'], base_url)
+    async def _add_requests(
+        self,
+        context: BasicCrawlingContext,
+        requests: Sequence[str | Request],
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
+        **kwargs: Unpack[EnqueueLinksKwargs],
+    ) -> None:
+        """Add requests method aware of the crawling context."""
+        if rq_id or rq_name or rq_alias:
+            request_manager: RequestManager = await RequestQueue.open(
+                id=rq_id,
+                name=rq_name,
+                alias=rq_alias,
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )
+        else:
+            request_manager = await self.get_request_manager()

-
+        context_aware_requests = list[Request]()
+        base_url = kwargs.get('base_url') or context.request.loaded_url or context.request.url
+        requests_iterator = self._convert_url_to_request_iterator(requests, base_url)
+        filter_requests_iterator = self._enqueue_links_filter_iterator(requests_iterator, context.request.url, **kwargs)
+        for dst_request in filter_requests_iterator:
+            # Update the crawl depth of the request.
+            dst_request.crawl_depth = context.request.crawl_depth + 1

-
-
-            )
+            if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:
+                context_aware_requests.append(dst_request)

-
-                # Update the crawl depth of the request.
-                dst_request.crawl_depth = context.request.crawl_depth + 1
+        return await request_manager.add_requests(context_aware_requests)

-
-
+    async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
+        """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
+        result = self._context_result_map[context]

-
+        for add_requests_call in result.add_requests_calls:
+            await self._add_requests(context, **add_requests_call)

         for push_data_call in result.push_data_calls:
             await self._push_data(**push_data_call)
@@ -1401,14 +1413,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             raise RequestHandlerError(e, context) from e

         await self._commit_request_handler_result(context)
-
-
-            timeout=self._internal_timeout,
-            timeout_message='Marking request as handled timed out after '
-            f'{self._internal_timeout.total_seconds()} seconds',
-            logger=self._logger,
-            max_retries=3,
-        )
+
+        await self._mark_request_as_handled(request)

         request.state = RequestState.DONE

@@ -1451,14 +1457,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             await request_manager.reclaim_request(request)
             await self._statistics.error_tracker_retry.add(error=session_error, context=context)
         else:
-            await
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            await self._mark_request_as_handled(request)

             await self._handle_failed_request(context, session_error)
             self._statistics.record_request_processing_failure(request.unique_key)
@@ -1466,14 +1465,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         except ContextPipelineInterruptedError as interrupted_error:
             self._logger.debug('The context pipeline was interrupted', exc_info=interrupted_error)

-            await
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            await self._mark_request_as_handled(request)

         except ContextPipelineInitializationError as initialization_error:
             self._logger.debug(
@@ -1491,12 +1483,15 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             raise

     async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
-        await
-
-
-
-
-
+        await self._context_pipeline(
+            context,
+            lambda final_context: wait_for(
+                lambda: self.router(final_context),
+                timeout=self._request_handler_timeout,
+                timeout_message=f'{self._request_handler_timeout_text}'
+                f' {self._request_handler_timeout.total_seconds()} seconds',
+                logger=self._logger,
+            ),
         )

     def _raise_for_error_status_code(self, status_code: int) -> None:
@@ -1644,3 +1639,14 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         )

         self._previous_crawler_state = current_state
+
+    async def _mark_request_as_handled(self, request: Request) -> None:
+        request_manager = await self.get_request_manager()
+        await wait_for(
+            lambda: request_manager.mark_request_as_handled(request),
+            timeout=self._internal_timeout,
+            timeout_message='Marking request as handled timed out after '
+            f'{self._internal_timeout.total_seconds()} seconds',
+            logger=self._logger,
+            max_retries=3,
+        )
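A rough sketch of what the handler-wrapping change above means in user code (URLs and file names are placeholders): the failed request handler's context helpers now write straight to the storages instead of going through the discarded RequestHandlerRunResult, and export_data forwards extra keyword arguments to the CSV/JSON exporter.

import asyncio

from crawlee.crawlers import BasicCrawler, BasicCrawlingContext


async def main() -> None:
    crawler = BasicCrawler(max_request_retries=1)

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        raise RuntimeError('simulated handler failure')

    @crawler.failed_request_handler
    async def failed(context: BasicCrawlingContext, error: Exception) -> None:
        # push_data now talks to the dataset directly, so this record survives the failed request.
        await context.push_data({'url': context.request.url, 'error': str(error)})

    await crawler.run(['https://example.com'])
    await crawler.export_data('failures.csv', delimiter=';')  # extra kwarg forwarded to the CSV writer


asyncio.run(main())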
crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py
CHANGED
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
 from bs4 import BeautifulSoup, Tag

 from crawlee._utils.docs import docs_group
-from crawlee.crawlers import AbstractHttpCrawler,
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions

 from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
 from ._beautifulsoup_parser import BeautifulSoupParser, BeautifulSoupParserType
@@ -58,7 +58,7 @@ class BeautifulSoupCrawler(AbstractHttpCrawler[BeautifulSoupCrawlingContext, BeautifulSoup, Tag]):
         self,
         *,
         parser: BeautifulSoupParserType = 'lxml',
-        **kwargs: Unpack[
+        **kwargs: Unpack[HttpCrawlerOptions[BeautifulSoupCrawlingContext]],
     ) -> None:
         """Initialize a new instance.
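The same HttpCrawlerOptions TypedDict that BeautifulSoupCrawler and ParselCrawler now use for their **kwargs can also type a thin user-defined subclass; a hedged sketch (MyCrawler is hypothetical, not part of the package):

from __future__ import annotations

from datetime import timedelta

from typing_extensions import Unpack

from crawlee.crawlers import HttpCrawlerOptions, ParselCrawler, ParselCrawlingContext


class MyCrawler(ParselCrawler):
    """Parsel-based crawler with a shorter default navigation budget."""

    def __init__(self, **kwargs: Unpack[HttpCrawlerOptions[ParselCrawlingContext]]) -> None:
        # navigation_timeout is one of the keys HttpCrawlerOptions adds on top of BasicCrawlerOptions.
        kwargs.setdefault('navigation_timeout', timedelta(seconds=30))
        super().__init__(**kwargs)


crawler = MyCrawler(max_requests_per_crawl=10)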
crawlee/crawlers/_parsel/_parsel_crawler.py
CHANGED
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
 from parsel import Selector

 from crawlee._utils.docs import docs_group
-from crawlee.crawlers import AbstractHttpCrawler,
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions

 from ._parsel_crawling_context import ParselCrawlingContext
 from ._parsel_parser import ParselParser
@@ -56,7 +56,7 @@ class ParselCrawler(AbstractHttpCrawler[ParselCrawlingContext, Selector, Selector]):

     def __init__(
         self,
-        **kwargs: Unpack[
+        **kwargs: Unpack[HttpCrawlerOptions[ParselCrawlingContext]],
     ) -> None:
         """Initialize a new instance.