crawlee 1.1.1b1__py3-none-any.whl → 1.2.1b7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlee might be problematic.
- crawlee/__init__.py +2 -1
- crawlee/_request.py +29 -10
- crawlee/_types.py +42 -2
- crawlee/_utils/context.py +2 -2
- crawlee/_utils/file.py +7 -0
- crawlee/_utils/recurring_task.py +2 -1
- crawlee/_utils/time.py +41 -1
- crawlee/crawlers/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +52 -14
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +10 -33
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
- crawlee/crawlers/_basic/_basic_crawler.py +135 -118
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +23 -4
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_playwright/_playwright_crawler.py +58 -17
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +1 -3
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +12 -0
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/router.py +13 -3
- crawlee/storage_clients/_file_system/_dataset_client.py +2 -2
- crawlee/storage_clients/_file_system/_key_value_store_client.py +3 -3
- crawlee/storage_clients/_file_system/_request_queue_client.py +3 -3
- crawlee/storage_clients/_sql/_storage_client.py +0 -9
- {crawlee-1.1.1b1.dist-info → crawlee-1.2.1b7.dist-info}/METADATA +10 -16
- {crawlee-1.1.1b1.dist-info → crawlee-1.2.1b7.dist-info}/RECORD +37 -36
- {crawlee-1.1.1b1.dist-info → crawlee-1.2.1b7.dist-info}/WHEEL +1 -1
- {crawlee-1.1.1b1.dist-info → crawlee-1.2.1b7.dist-info}/entry_points.txt +0 -0
- {crawlee-1.1.1b1.dist-info → crawlee-1.2.1b7.dist-info}/licenses/LICENSE +0 -0
crawlee/__init__.py
CHANGED
@@ -1,6 +1,6 @@
 from importlib import metadata
 
-from ._request import Request, RequestOptions
+from ._request import Request, RequestOptions, RequestState
 from ._service_locator import service_locator
 from ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders, RequestTransformAction, SkippedReason
 from ._utils.globs import Glob
@@ -14,6 +14,7 @@ __all__ = [
     'HttpHeaders',
     'Request',
     'RequestOptions',
+    'RequestState',
     'RequestTransformAction',
     'SkippedReason',
     'service_locator',
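Usage note: `RequestState` is now re-exported from the package root, and new requests default to `RequestState.UNPROCESSED` (see the `_request.py` diff below). A minimal sketch, assuming crawlee >= 1.2 is installed:

from crawlee import Request, RequestState

request = Request.from_url('https://example.com')
print(request.state is RequestState.UNPROCESSED)   # True - the new default per the diff below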
crawlee/_request.py
CHANGED
@@ -34,14 +34,14 @@ class RequestState(IntEnum):
 class CrawleeRequestData(BaseModel):
     """Crawlee-specific configuration stored in the `user_data`."""
 
-    max_retries: Annotated[int | None, Field(alias='maxRetries')] = None
+    max_retries: Annotated[int | None, Field(alias='maxRetries', frozen=True)] = None
     """Maximum number of retries for this request. Allows to override the global `max_request_retries` option of
     `BasicCrawler`."""
 
     enqueue_strategy: Annotated[EnqueueStrategy | None, Field(alias='enqueueStrategy')] = None
     """The strategy that was used for enqueuing the request."""
 
-    state: RequestState
+    state: RequestState = RequestState.UNPROCESSED
     """Describes the request's current lifecycle state."""
 
     session_rotation_count: Annotated[int | None, Field(alias='sessionRotationCount')] = None
@@ -137,6 +137,8 @@ class RequestOptions(TypedDict):
     always_enqueue: NotRequired[bool]
     user_data: NotRequired[dict[str, JsonSerializable]]
     no_retry: NotRequired[bool]
+    enqueue_strategy: NotRequired[EnqueueStrategy]
+    max_retries: NotRequired[int | None]
 
 
 @docs_group('Storage data')
@@ -166,7 +168,7 @@ class Request(BaseModel):
 
     model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
-    unique_key: Annotated[str, Field(alias='uniqueKey')]
+    unique_key: Annotated[str, Field(alias='uniqueKey', frozen=True)]
     """A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
     to the same URL.
 
@@ -178,17 +180,18 @@ class Request(BaseModel):
     and specify which URLs shall be considered equal.
     """
 
-    url: Annotated[str, BeforeValidator(validate_http_url), Field()]
+    url: Annotated[str, BeforeValidator(validate_http_url), Field(frozen=True)]
     """The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters
     and fragments."""
 
-    method: HttpMethod = 'GET'
+    method: Annotated[HttpMethod, Field(frozen=True)] = 'GET'
    """HTTP request method."""
 
     payload: Annotated[
         HttpPayload | None,
         BeforeValidator(lambda v: v.encode() if isinstance(v, str) else v),
         PlainSerializer(lambda v: v.decode() if isinstance(v, bytes) else v),
+        Field(frozen=True),
     ] = None
     """HTTP request payload."""
 
@@ -250,6 +253,8 @@ class Request(BaseModel):
         keep_url_fragment: bool = False,
         use_extended_unique_key: bool = False,
         always_enqueue: bool = False,
+        enqueue_strategy: EnqueueStrategy | None = None,
+        max_retries: int | None = None,
         **kwargs: Any,
     ) -> Self:
         """Create a new `Request` instance from a URL.
@@ -277,6 +282,9 @@ class Request(BaseModel):
                 `unique_key` computation. This is only relevant when `unique_key` is not provided.
             always_enqueue: If set to `True`, the request will be enqueued even if it is already present in the queue.
                 Using this is not allowed when a custom `unique_key` is also provided and will result in a `ValueError`.
+            enqueue_strategy: The strategy that will be used for enqueuing the request.
+            max_retries: Maximum number of retries for this request. Allows to override the global `max_request_retries`
+                option of `BasicCrawler`.
             **kwargs: Additional request properties.
         """
         if unique_key is not None and always_enqueue:
@@ -301,12 +309,27 @@ class Request(BaseModel):
         if always_enqueue:
             unique_key = f'{crypto_random_object_id()}|{unique_key}'
 
+        user_data_dict = kwargs.pop('user_data', {}) or {}
+        crawlee_data_dict = user_data_dict.get('__crawlee', {})
+
+        if max_retries is not None:
+            crawlee_data_dict['maxRetries'] = max_retries
+
+        if enqueue_strategy is not None:
+            crawlee_data_dict['enqueueStrategy'] = enqueue_strategy
+
+        crawlee_data = CrawleeRequestData(**crawlee_data_dict)
+
+        if crawlee_data:
+            user_data_dict['__crawlee'] = crawlee_data
+
         request = cls(
             url=url,
             unique_key=unique_key,
             method=method,
             headers=headers,
             payload=payload,
+            user_data=user_data_dict,
             **kwargs,
         )
 
@@ -352,7 +375,7 @@ class Request(BaseModel):
         self.crawlee_data.crawl_depth = new_value
 
     @property
-    def state(self) -> RequestState
+    def state(self) -> RequestState:
         """Crawlee-specific request handling state."""
         return self.crawlee_data.state
 
@@ -365,10 +388,6 @@ class Request(BaseModel):
         """Crawlee-specific limit on the number of retries of the request."""
         return self.crawlee_data.max_retries
 
-    @max_retries.setter
-    def max_retries(self, new_max_retries: int) -> None:
-        self.crawlee_data.max_retries = new_max_retries
-
     @property
     def session_rotation_count(self) -> int | None:
         """Crawlee-specific number of finished session rotations for the request."""
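Taken together, the changes above mean that `Request.from_url` can now set a per-request retry limit and enqueue strategy up front (stored under `user_data['__crawlee']`), while `max_retries` becomes read-only afterwards because the field is frozen and its setter was removed. A minimal sketch, assuming crawlee >= 1.2:

from crawlee import Request

request = Request.from_url(
    'https://example.com/start',
    max_retries=2,                    # per-request override of the crawler's max_request_retries
    enqueue_strategy='same-hostname',
)
print(request.max_retries)            # 2 - now a read-only property; the setter is gone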
crawlee/_types.py
CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations
 
 import dataclasses
 from collections.abc import Callable, Iterator, Mapping
+from copy import deepcopy
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Annotated, Any, Literal, Protocol, TypedDict, TypeVar, cast, overload
 
@@ -15,7 +16,7 @@ if TYPE_CHECKING:
     import re
     from collections.abc import Callable, Coroutine, Sequence
 
-    from typing_extensions import NotRequired, Required, Unpack
+    from typing_extensions import NotRequired, Required, Self, Unpack
 
     from crawlee import Glob, Request
     from crawlee._request import RequestOptions
@@ -260,12 +261,24 @@ class KeyValueStoreChangeRecords:
 class RequestHandlerRunResult:
     """Record of calls to storage-related context helpers."""
 
-    def __init__(
+    def __init__(
+        self,
+        *,
+        key_value_store_getter: GetKeyValueStoreFunction,
+        request: Request,
+    ) -> None:
         self._key_value_store_getter = key_value_store_getter
         self.add_requests_calls = list[AddRequestsKwargs]()
         self.push_data_calls = list[PushDataFunctionCall]()
         self.key_value_store_changes = dict[tuple[str | None, str | None, str | None], KeyValueStoreChangeRecords]()
 
+        # Isolated copies for handler execution
+        self._request = deepcopy(request)
+
+    @property
+    def request(self) -> Request:
+        return self._request
+
     async def add_requests(
         self,
         requests: Sequence[str | Request],
@@ -315,6 +328,14 @@ class RequestHandlerRunResult:
 
         return self.key_value_store_changes[id, name, alias]
 
+    def apply_request_changes(self, target: Request) -> None:
+        """Apply tracked changes from handler copy to original request."""
+        if self.request.user_data != target.user_data:
+            target.user_data = self.request.user_data
+
+        if self.request.headers != target.headers:
+            target.headers = self.request.headers
+
 
 @docs_group('Functions')
 class AddRequestsFunction(Protocol):
@@ -643,6 +664,25 @@ class BasicCrawlingContext:
         """Return hash of the context. Each context is considered unique."""
         return id(self)
 
+    def create_modified_copy(
+        self,
+        push_data: PushDataFunction | None = None,
+        add_requests: AddRequestsFunction | None = None,
+        get_key_value_store: GetKeyValueStoreFromRequestHandlerFunction | None = None,
+    ) -> Self:
+        """Create a modified copy of the crawling context with specified changes."""
+        original_fields = {field.name: getattr(self, field.name) for field in dataclasses.fields(self)}
+        modified_fields = {
+            key: value
+            for key, value in {
+                'push_data': push_data,
+                'add_requests': add_requests,
+                'get_key_value_store': get_key_value_store,
+            }.items()
+            if value
+        }
+        return self.__class__(**{**original_fields, **modified_fields})
+
 
 class GetDataKwargs(TypedDict):
     """Keyword arguments for dataset's `get_data` method."""
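The result object now works on a deep copy of the request, and `apply_request_changes` copies `user_data` and `headers` back only after the handler run is accepted. A self-contained sketch of that copy-then-commit pattern (the `FakeRequest` class and helper below are illustrative stand-ins, not crawlee APIs):

from copy import deepcopy
from dataclasses import dataclass, field

@dataclass
class FakeRequest:  # illustrative stand-in for crawlee.Request
    url: str
    user_data: dict = field(default_factory=dict)
    headers: dict = field(default_factory=dict)

def run_handler_isolated(handler, request: FakeRequest) -> None:
    working_copy = deepcopy(request)   # the handler mutates a copy, never the original
    handler(working_copy)
    # Commit only the tracked fields back, mirroring apply_request_changes above.
    if working_copy.user_data != request.user_data:
        request.user_data = working_copy.user_data
    if working_copy.headers != request.headers:
        request.headers = working_copy.headers

request = FakeRequest('https://example.com')
run_handler_isolated(lambda r: r.user_data.update(label='DETAIL'), request)
print(request.user_data)   # {'label': 'DETAIL'}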
crawlee/_utils/context.py
CHANGED
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-import
+import inspect
 from collections.abc import Callable
 from functools import wraps
 from typing import Any, TypeVar
@@ -44,4 +44,4 @@ def ensure_context(method: T) -> T:
 
         return await method(self, *args, **kwargs)
 
-    return async_wrapper if
+    return async_wrapper if inspect.iscoroutinefunction(method) else sync_wrapper  # type: ignore[return-value]
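The decorator now picks its wrapper with `inspect.iscoroutinefunction`. A self-contained sketch of the same dispatch pattern (illustrative, not the library code):

import asyncio
import inspect
from functools import wraps

def log_calls(func):
    """Wrap either a sync or an async callable, choosing the wrapper at decoration time."""
    @wraps(func)
    def sync_wrapper(*args, **kwargs):
        print(f'calling {func.__name__}')
        return func(*args, **kwargs)

    @wraps(func)
    async def async_wrapper(*args, **kwargs):
        print(f'calling {func.__name__}')
        return await func(*args, **kwargs)

    return async_wrapper if inspect.iscoroutinefunction(func) else sync_wrapper

@log_calls
async def fetch():
    return 42

print(asyncio.run(fetch()))   # prints "calling fetch" then 42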
crawlee/_utils/file.py
CHANGED
@@ -163,6 +163,13 @@ async def export_csv_to_stream(
     dst: TextIO,
     **kwargs: Unpack[ExportDataCsvKwargs],
 ) -> None:
+    # Set lineterminator to '\n' if not explicitly provided. This prevents double line endings on Windows.
+    # The csv.writer default is '\r\n', which when written to a file in text mode on Windows gets converted
+    # to '\r\r\n' due to newline translation. By using '\n', we let the platform handle the line ending
+    # conversion: '\n' stays as '\n' on Unix, and becomes '\r\n' on Windows.
+    if 'lineterminator' not in kwargs:
+        kwargs['lineterminator'] = '\n'
+
     writer = csv.writer(dst, **kwargs)  # type: ignore[arg-type]
     write_header = True
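For illustration, the difference the new default makes (standard-library behaviour; the file names are arbitrary):

import csv

# csv.writer's default lineterminator is '\r\n'. When the destination is opened in text
# mode on Windows, every '\n' is additionally translated to '\r\n', producing '\r\r\n'.
with open('rows_default.csv', 'w') as dst:
    csv.writer(dst).writerow(['a', 'b'])                        # Windows text mode: 'a,b\r\r\n'

# Passing lineterminator='\n' leaves the translation to the platform:
# '\n' stays '\n' on Unix and becomes '\r\n' on Windows.
with open('rows_fixed.csv', 'w') as dst:
    csv.writer(dst, lineterminator='\n').writerow(['a', 'b'])   # Windows text mode: 'a,b\r\n'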
crawlee/_utils/recurring_task.py
CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import asyncio
+import inspect
 from logging import getLogger
 from typing import TYPE_CHECKING
 
@@ -49,7 +50,7 @@ class RecurringTask:
         """
         sleep_time_secs = self.delay.total_seconds()
         while True:
-            await self.func() if
+            await self.func() if inspect.iscoroutinefunction(self.func) else self.func()
             await asyncio.sleep(sleep_time_secs)
 
     def start(self) -> None:
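The loop awaits `self.func()` only when it is a coroutine function, so both sync and async callables can be scheduled. A self-contained sketch of the same pattern (illustrative names, not the library class):

import asyncio
import inspect
from datetime import timedelta

async def run_recurring(func, delay: timedelta, repetitions: int) -> None:
    """Call `func` every `delay`, awaiting it only when it is an async callable."""
    for _ in range(repetitions):
        await func() if inspect.iscoroutinefunction(func) else func()
        await asyncio.sleep(delay.total_seconds())

async def tick_async() -> None:
    print('async tick')

asyncio.run(run_recurring(tick_async, delay=timedelta(seconds=0.1), repetitions=3))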
crawlee/_utils/time.py
CHANGED
@@ -3,11 +3,14 @@ from __future__ import annotations
 import time
 from contextlib import contextmanager
 from dataclasses import dataclass
+from datetime import timedelta
 from typing import TYPE_CHECKING
 
+from async_timeout import Timeout, timeout
+
 if TYPE_CHECKING:
     from collections.abc import Iterator
-    from
+    from types import TracebackType
 
 _SECONDS_PER_MINUTE = 60
 _SECONDS_PER_HOUR = 3600
@@ -35,6 +38,43 @@ def measure_time() -> Iterator[TimerResult]:
         result.cpu = after_cpu - before_cpu
 
 
+class SharedTimeout:
+    """Keeps track of a time budget shared by multiple independent async operations.
+
+    Provides a reusable, non-reentrant context manager interface.
+    """
+
+    def __init__(self, timeout: timedelta) -> None:
+        self._remaining_timeout = timeout
+        self._active_timeout: Timeout | None = None
+        self._activation_timestamp: float | None = None
+
+    async def __aenter__(self) -> timedelta:
+        if self._active_timeout is not None or self._activation_timestamp is not None:
+            raise RuntimeError('A shared timeout context cannot be entered twice at the same time')
+
+        self._activation_timestamp = time.monotonic()
+        self._active_timeout = new_timeout = timeout(self._remaining_timeout.total_seconds())
+        await new_timeout.__aenter__()
+        return self._remaining_timeout
+
+    async def __aexit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
+    ) -> None:
+        if self._active_timeout is None or self._activation_timestamp is None:
+            raise RuntimeError('Logic error')
+
+        await self._active_timeout.__aexit__(exc_type, exc_value, exc_traceback)
+        elapsed = time.monotonic() - self._activation_timestamp
+        self._remaining_timeout = self._remaining_timeout - timedelta(seconds=elapsed)
+
+        self._active_timeout = None
+        self._activation_timestamp = None
+
+
 def format_duration(duration: timedelta | None) -> str:
     """Format a timedelta into a human-readable string with appropriate units."""
     if duration is None:
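The intended usage pattern is that several sequential `async with` blocks draw from one shared budget, each enter receiving whatever time is left. A rough sketch of that pattern (imports an internal helper added in this release; requires the `async-timeout` package):

import asyncio
from datetime import timedelta

from crawlee._utils.time import SharedTimeout  # internal helper introduced in this release

async def main() -> None:
    budget = SharedTimeout(timedelta(seconds=2))

    async with budget as remaining:
        print(f'first phase, remaining budget: {remaining}')
        await asyncio.sleep(0.5)

    async with budget as remaining:   # roughly 1.5 s of the original 2 s is left
        print(f'second phase, remaining budget: {remaining}')
        await asyncio.sleep(0.5)      # sleeping past the leftover budget raises a timeout error

asyncio.run(main())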
crawlee/crawlers/__init__.py
CHANGED
@@ -1,7 +1,7 @@
 from crawlee._utils.try_import import install_import_hook as _install_import_hook
 from crawlee._utils.try_import import try_import as _try_import
 
-from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, ParsedHttpCrawlingContext
+from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, HttpCrawlerOptions, ParsedHttpCrawlingContext
 from ._basic import BasicCrawler, BasicCrawlerOptions, BasicCrawlingContext, ContextPipeline
 from ._http import HttpCrawler, HttpCrawlingContext, HttpCrawlingResult
 
@@ -51,6 +51,7 @@ __all__ = [
     'BeautifulSoupParserType',
     'ContextPipeline',
     'HttpCrawler',
+    'HttpCrawlerOptions',
     'HttpCrawlingContext',
     'HttpCrawlingResult',
     'ParsedHttpCrawlingContext',
crawlee/crawlers/_abstract_http/__init__.py
CHANGED
@@ -1,9 +1,10 @@
-from ._abstract_http_crawler import AbstractHttpCrawler
+from ._abstract_http_crawler import AbstractHttpCrawler, HttpCrawlerOptions
 from ._abstract_http_parser import AbstractHttpParser
 from ._http_crawling_context import ParsedHttpCrawlingContext
 
 __all__ = [
     'AbstractHttpCrawler',
     'AbstractHttpParser',
+    'HttpCrawlerOptions',
     'ParsedHttpCrawlingContext',
 ]
crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
CHANGED
@@ -3,14 +3,16 @@ from __future__ import annotations
 import asyncio
 import logging
 from abc import ABC
+from datetime import timedelta
 from typing import TYPE_CHECKING, Any, Generic
 
 from more_itertools import partition
 from pydantic import ValidationError
-from typing_extensions import TypeVar
+from typing_extensions import NotRequired, TypeVar
 
-from crawlee._request import Request, RequestOptions
+from crawlee._request import Request, RequestOptions, RequestState
 from crawlee._utils.docs import docs_group
+from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
 from crawlee.errors import SessionError
@@ -32,6 +34,19 @@ TCrawlingContext = TypeVar('TCrawlingContext', bound=ParsedHttpCrawlingContext)
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 
 
+class HttpCrawlerOptions(
+    BasicCrawlerOptions[TCrawlingContext, TStatisticsState],
+    Generic[TCrawlingContext, TStatisticsState],
+):
+    """Arguments for the `AbstractHttpCrawler` constructor.
+
+    It is intended for typing forwarded `__init__` arguments in the subclasses.
+    """
+
+    navigation_timeout: NotRequired[timedelta | None]
+    """Timeout for the HTTP request."""
+
+
 @docs_group('Crawlers')
 class AbstractHttpCrawler(
     BasicCrawler[TCrawlingContext, StatisticsState],
@@ -56,10 +71,13 @@ class AbstractHttpCrawler(
         self,
         *,
         parser: AbstractHttpParser[TParseResult, TSelectResult],
+        navigation_timeout: timedelta | None = None,
         **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]],
     ) -> None:
         self._parser = parser
+        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
         self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
+        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}
 
         if '_context_pipeline' not in kwargs:
             raise ValueError(
@@ -112,9 +130,17 @@ class AbstractHttpCrawler(
     async def _execute_pre_navigation_hooks(
         self, context: BasicCrawlingContext
     ) -> AsyncGenerator[BasicCrawlingContext, None]:
-
-
-
+        context_id = id(context)
+        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
+
+        try:
+            for hook in self._pre_navigation_hooks:
+                async with self._shared_navigation_timeouts[context_id]:
+                    await hook(context)
+
+            yield context
+        finally:
+            self._shared_navigation_timeouts.pop(context_id, None)
 
     async def _parse_http_response(
         self, context: HttpCrawlingContext
@@ -165,11 +191,18 @@ class AbstractHttpCrawler(
         robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)
 
         kwargs.setdefault('strategy', 'same-hostname')
+        strategy = kwargs.get('strategy', 'same-hostname')
 
         links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
-
-
+
+        # Get base URL from <base> tag if present
+        extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]'))
+        base_url: str = (
+            str(extracted_base_urls[0])
+            if extracted_base_urls
+            else context.request.loaded_url or context.request.url
         )
+        links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)
 
         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
@@ -177,7 +210,9 @@ class AbstractHttpCrawler(
             skipped = iter([])
 
         for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
-            request_options = RequestOptions(
+            request_options = RequestOptions(
+                url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
+            )
 
             if transform_request_function:
                 transform_request_options = transform_request_function(request_options)
@@ -216,13 +251,16 @@ class AbstractHttpCrawler(
         Yields:
             The original crawling context enhanced by HTTP response.
         """
-
-
-
-
-
-
+        async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+            result = await self._http_client.crawl(
+                request=context.request,
+                session=context.session,
+                proxy_info=context.proxy_info,
+                statistics=self._statistics,
+                timeout=remaining_timeout,
+            )
 
+        context.request.state = RequestState.AFTER_NAV
         yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response)
 
     async def _handle_status_code_response(
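The new `navigation_timeout` option (defaulting to one minute) is shared via `SharedTimeout` between the pre-navigation hooks and the HTTP request itself, and `HttpCrawlerOptions` exposes it for subclass constructors. A hedged sketch of passing it to a concrete crawler, assuming the concrete HTTP crawlers (e.g. `ParselCrawler`, per the `_parsel_crawler.py` entry in the file list) forward these options:

import asyncio
from datetime import timedelta

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext

async def main() -> None:
    crawler = ParselCrawler(
        navigation_timeout=timedelta(seconds=30),  # assumed new option forwarded via HttpCrawlerOptions
        max_request_retries=1,
    )

    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        title = context.selector.css('title::text').get()
        await context.push_data({'url': context.request.url, 'title': title})

    await crawler.run(['https://crawlee.dev'])

asyncio.run(main())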
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py
CHANGED
@@ -290,11 +290,14 @@ class AdaptivePlaywrightCrawler(
         use_state_function = context.use_state
 
         # New result is created and injected to newly created context. This is done to ensure isolation of sub crawlers.
-        result = RequestHandlerRunResult(
+        result = RequestHandlerRunResult(
+            key_value_store_getter=self.get_key_value_store,
+            request=context.request,
+        )
         context_linked_to_result = BasicCrawlingContext(
-            request=
-            session=
-            proxy_info=
+            request=result.request,
+            session=context.session,
+            proxy_info=context.proxy_info,
             send_request=context.send_request,
             add_requests=result.add_requests,
             push_data=result.push_data,
@@ -314,7 +317,7 @@ class AdaptivePlaywrightCrawler(
                 ),
                 logger=self._logger,
             )
-            return SubCrawlerRun(result=result
+            return SubCrawlerRun(result=result)
         except Exception as e:
             return SubCrawlerRun(exception=e)
 
@@ -370,8 +373,7 @@ class AdaptivePlaywrightCrawler(
             self.track_http_only_request_handler_runs()
 
             static_run = await self._crawl_one(rendering_type='static', context=context)
-            if static_run.result and
-                self._update_context_from_copy(context, static_run.run_context)
+            if static_run.result and self.result_checker(static_run.result):
                 self._context_result_map[context] = static_run.result
                 return
             if static_run.exception:
@@ -402,7 +404,7 @@ class AdaptivePlaywrightCrawler(
         if pw_run.exception is not None:
             raise pw_run.exception
 
-        if pw_run.result
+        if pw_run.result:
             if should_detect_rendering_type:
                 detection_result: RenderingType
                 static_run = await self._crawl_one('static', context=context, state=old_state_copy)
@@ -414,7 +416,6 @@ class AdaptivePlaywrightCrawler(
                 context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}')
                 self.rendering_type_predictor.store_result(context.request, detection_result)
 
-            self._update_context_from_copy(context, pw_run.run_context)
             self._context_result_map[context] = pw_run.result
 
     def pre_navigation_hook(
@@ -451,32 +452,8 @@ class AdaptivePlaywrightCrawler(
     def track_rendering_type_mispredictions(self) -> None:
         self.statistics.state.rendering_type_mispredictions += 1
 
-    def _update_context_from_copy(self, context: BasicCrawlingContext, context_copy: BasicCrawlingContext) -> None:
-        """Update mutable fields of `context` from `context_copy`.
-
-        Uses object.__setattr__ to bypass frozen dataclass restrictions,
-        allowing state synchronization after isolated crawler execution.
-        """
-        updating_attributes = {
-            'request': ('headers', 'user_data'),
-            'session': ('_user_data', '_usage_count', '_error_score', '_cookies'),
-        }
-
-        for attr, sub_attrs in updating_attributes.items():
-            original_sub_obj = getattr(context, attr)
-            copy_sub_obj = getattr(context_copy, attr)
-
-            # Check that both sub objects are not None
-            if original_sub_obj is None or copy_sub_obj is None:
-                continue
-
-            for sub_attr in sub_attrs:
-                new_value = getattr(copy_sub_obj, sub_attr)
-                object.__setattr__(original_sub_obj, sub_attr, new_value)
-
 
 @dataclass(frozen=True)
 class SubCrawlerRun:
     result: RequestHandlerRunResult | None = None
     exception: Exception | None = None
-    run_context: BasicCrawlingContext | None = None
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py
CHANGED
@@ -17,7 +17,7 @@
     from playwright.async_api import Page, Response
     from typing_extensions import Self
 
-    from crawlee.crawlers._playwright._types import BlockRequestsFunction
+    from crawlee.crawlers._playwright._types import BlockRequestsFunction, GotoOptions
 
 
 TStaticParseResult = TypeVar('TStaticParseResult')
@@ -190,8 +190,9 @@ class AdaptivePlaywrightCrawlingContext(
         http_response = await PlaywrightHttpResponse.from_playwright_response(
             response=context.response, protocol=protocol_guess or ''
         )
-        # block_requests
+        # block_requests and goto_options are useful only on pre-navigation contexts. It is useless here.
         context_kwargs.pop('block_requests')
+        context_kwargs.pop('goto_options')
         return cls(
             parsed_content=await parser.parse(http_response),
             http_response=http_response,
@@ -212,6 +213,9 @@ class AdaptivePlaywrightPreNavCrawlingContext(BasicCrawlingContext):
     block_requests: BlockRequestsFunction | None = None
     """Blocks network requests matching specified URL patterns."""
 
+    goto_options: GotoOptions | None = None
+    """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""
+
     @property
     def page(self) -> Page:
         """The Playwright `Page` object for the current page.
|