crawlee 0.6.13b15__py3-none-any.whl → 1.3.1b3__py3-none-any.whl
- crawlee/__init__.py +2 -1
- crawlee/_autoscaling/snapshotter.py +1 -1
- crawlee/_browserforge_workaround.py +7 -3
- crawlee/_request.py +64 -43
- crawlee/_service_locator.py +44 -24
- crawlee/_types.py +128 -36
- crawlee/_utils/context.py +3 -3
- crawlee/_utils/file.py +8 -1
- crawlee/_utils/globs.py +4 -4
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +27 -3
- crawlee/_utils/requests.py +0 -26
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +16 -7
- crawlee/_utils/system.py +30 -14
- crawlee/_utils/time.py +120 -0
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +5 -2
- crawlee/browsers/_playwright_browser.py +2 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +2 -0
- crawlee/crawlers/__init__.py +5 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +57 -17
- crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
- crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
- crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +27 -32
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
- crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
- crawlee/crawlers/_basic/_basic_crawler.py +254 -148
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +27 -4
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +73 -23
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +12 -6
- crawlee/events/_types.py +6 -6
- crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
- crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/fingerprint_suite/_types.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +68 -14
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +32 -11
- crawlee/otel/crawler_instrumentor.py +4 -6
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_request_list.py +3 -3
- crawlee/request_loaders/_request_loader.py +5 -1
- crawlee/request_loaders/_sitemap_request_loader.py +248 -50
- crawlee/router.py +13 -3
- crawlee/sessions/_cookies.py +13 -8
- crawlee/sessions/_models.py +5 -5
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +62 -12
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_dataset_client.py +2 -2
- crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_base/_request_queue_client.py +2 -2
- crawlee/storage_clients/_base/_storage_client.py +13 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +33 -30
- crawlee/storage_clients/_file_system/_key_value_store_client.py +34 -27
- crawlee/storage_clients/_file_system/_request_queue_client.py +171 -154
- crawlee/storage_clients/_file_system/_storage_client.py +16 -3
- crawlee/storage_clients/_file_system/_utils.py +0 -0
- crawlee/storage_clients/_memory/_dataset_client.py +16 -4
- crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
- crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
- crawlee/storage_clients/_memory/_storage_client.py +6 -3
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +292 -0
- crawlee/storage_clients/_redis/_dataset_client.py +329 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
- crawlee/storage_clients/_redis/_storage_client.py +149 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +282 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +21 -14
- crawlee/storages/_base.py +5 -1
- crawlee/storages/_dataset.py +12 -2
- crawlee/storages/_key_value_store.py +17 -4
- crawlee/storages/_request_queue.py +13 -5
- crawlee/storages/_storage_instance_manager.py +196 -75
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/METADATA +24 -19
- crawlee-1.3.1b3.dist-info/RECORD +207 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/WHEEL +1 -1
- crawlee/_utils/measure_time.py +0 -31
- crawlee-0.6.13b15.dist-info/RECORD +0 -183
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/licenses/LICENSE +0 -0
crawlee/_utils/sitemap.py
CHANGED
@@ -9,6 +9,7 @@ from datetime import datetime, timedelta
 from hashlib import sha256
 from logging import getLogger
 from typing import TYPE_CHECKING, Literal, TypedDict
+from xml.sax import SAXParseException
 from xml.sax.expatreader import ExpatParser
 from xml.sax.handler import ContentHandler

@@ -192,7 +193,8 @@ class _XmlSitemapParser:

     def close(self) -> None:
         """Clean up resources."""
-
+        with suppress(SAXParseException):
+            self._parser.close()


 def _get_parser(content_type: str = '', url: str | None = None) -> _XmlSitemapParser | _TxtSitemapParser:
@@ -333,7 +335,7 @@ async def _fetch_and_process_sitemap(
             # Check if the first chunk is a valid gzip header
             if first_chunk and raw_chunk.startswith(b'\x1f\x8b'):
                 decompressor = zlib.decompressobj(zlib.MAX_WBITS | 16)
-
+            first_chunk = False

             chunk = decompressor.decompress(raw_chunk) if decompressor else raw_chunk
             text_chunk = decoder.decode(chunk)
@@ -428,10 +430,17 @@ async def parse_sitemap(
         up to the specified maximum depth.
     """
     # Set default options
-
-
-
-
+    default_timeout = timedelta(seconds=30)
+    if options:
+        emit_nested_sitemaps = options['emit_nested_sitemaps']
+        max_depth = options['max_depth']
+        sitemap_retries = options['sitemap_retries']
+        timeout = options.get('timeout', default_timeout)
+    else:
+        emit_nested_sitemaps = False
+        max_depth = float('inf')
+        sitemap_retries = 3
+        timeout = default_timeout

     # Setup working state
     sources = list(initial_sources)
@@ -470,7 +479,7 @@ async def parse_sitemap(
             sitemap_retries,
             emit_nested_sitemaps=emit_nested_sitemaps,
             proxy_info=proxy_info,
-            timeout=
+            timeout=timeout,
         ):
             yield result
     else:
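The `_fetch_and_process_sitemap` hunk above touches a streaming gzip-detection pattern: the first chunk is sniffed for the gzip magic bytes, a `zlib` decompressor in gzip mode is created if they match, and every subsequent chunk is routed through it before incremental text decoding. A self-contained sketch of that technique (the chunk source and the UTF-8 decoder here are assumptions, not crawlee's actual fetch loop):

```python
import codecs
import gzip
import zlib
from collections.abc import Iterable, Iterator


def iter_text_chunks(raw_chunks: Iterable[bytes]) -> Iterator[str]:
    """Decode a byte-chunk stream that may or may not be gzip-compressed."""
    decompressor = None
    decoder = codecs.getincrementaldecoder('utf-8')(errors='replace')
    first_chunk = True

    for raw_chunk in raw_chunks:
        # Only the first chunk is sniffed for the gzip magic bytes (0x1f 0x8b).
        if first_chunk and raw_chunk.startswith(b'\x1f\x8b'):
            # MAX_WBITS | 16 tells zlib to expect a gzip wrapper, not a raw deflate stream.
            decompressor = zlib.decompressobj(zlib.MAX_WBITS | 16)
        first_chunk = False

        chunk = decompressor.decompress(raw_chunk) if decompressor else raw_chunk
        if text := decoder.decode(chunk):
            yield text


# Both a plain and a gzip-compressed payload decode to the same text.
print(list(iter_text_chunks([b'<urlset/>'])))
print(list(iter_text_chunks([gzip.compress(b'<urlset/>')])))
```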
crawlee/_utils/system.py
CHANGED
@@ -5,7 +5,7 @@ import sys
 from contextlib import suppress
 from datetime import datetime, timezone
 from logging import getLogger
-from typing import Annotated
+from typing import TYPE_CHECKING, Annotated

 import psutil
 from pydantic import BaseModel, ConfigDict, Field, PlainSerializer, PlainValidator
@@ -36,22 +36,30 @@ else:
 class CpuInfo(BaseModel):
     """Information about the CPU usage."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     used_ratio: Annotated[float, Field(alias='usedRatio')]
     """The ratio of CPU currently in use, represented as a float between 0 and 1."""

-
-
-
-
-
+    # Workaround for Pydantic and type checkers when using Annotated with default_factory
+    if TYPE_CHECKING:
+        created_at: datetime = datetime.now(timezone.utc)
+        """The time at which the measurement was taken."""
+    else:
+        created_at: Annotated[
+            datetime,
+            Field(
+                alias='createdAt',
+                default_factory=lambda: datetime.now(timezone.utc),
+            ),
+        ]
+        """The time at which the measurement was taken."""


 class MemoryUsageInfo(BaseModel):
     """Information about the memory usage."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     current_size: Annotated[
         ByteSize,
@@ -61,17 +69,25 @@ class MemoryUsageInfo(BaseModel):
     ]
     """Memory usage of the current Python process and its children."""

-
-
-
-
-
+    # Workaround for Pydantic and type checkers when using Annotated with default_factory
+    if TYPE_CHECKING:
+        created_at: datetime = datetime.now(timezone.utc)
+        """The time at which the measurement was taken."""
+    else:
+        created_at: Annotated[
+            datetime,
+            Field(
+                alias='createdAt',
+                default_factory=lambda: datetime.now(timezone.utc),
+            ),
+        ]
+        """The time at which the measurement was taken."""


 class MemoryInfo(MemoryUsageInfo):
     """Information about system memory."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     total_size: Annotated[
         ByteSize, PlainValidator(ByteSize.validate), PlainSerializer(lambda size: size.bytes), Field(alias='totalSize')
crawlee/_utils/time.py
ADDED
@@ -0,0 +1,120 @@
+from __future__ import annotations
+
+import time
+from contextlib import contextmanager
+from dataclasses import dataclass
+from datetime import timedelta
+from typing import TYPE_CHECKING
+
+from async_timeout import Timeout, timeout
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+    from types import TracebackType
+
+_SECONDS_PER_MINUTE = 60
+_SECONDS_PER_HOUR = 3600
+
+
+@dataclass
+class TimerResult:
+    wall: float | None = None
+    cpu: float | None = None
+
+
+@contextmanager
+def measure_time() -> Iterator[TimerResult]:
+    """Measure the execution time (wall-clock and CPU) between the start and end of the with-block."""
+    result = TimerResult()
+    before_wall = time.monotonic()
+    before_cpu = time.thread_time()
+
+    try:
+        yield result
+    finally:
+        after_wall = time.monotonic()
+        after_cpu = time.thread_time()
+        result.wall = after_wall - before_wall
+        result.cpu = after_cpu - before_cpu
+
+
+class SharedTimeout:
+    """Keeps track of a time budget shared by multiple independent async operations.
+
+    Provides a reusable, non-reentrant context manager interface.
+    """
+
+    def __init__(self, timeout: timedelta) -> None:
+        self._remaining_timeout = timeout
+        self._active_timeout: Timeout | None = None
+        self._activation_timestamp: float | None = None
+
+    async def __aenter__(self) -> timedelta:
+        if self._active_timeout is not None or self._activation_timestamp is not None:
+            raise RuntimeError('A shared timeout context cannot be entered twice at the same time')
+
+        self._activation_timestamp = time.monotonic()
+        self._active_timeout = new_timeout = timeout(self._remaining_timeout.total_seconds())
+        await new_timeout.__aenter__()
+        return self._remaining_timeout
+
+    async def __aexit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
+    ) -> None:
+        if self._active_timeout is None or self._activation_timestamp is None:
+            raise RuntimeError('Logic error')
+
+        await self._active_timeout.__aexit__(exc_type, exc_value, exc_traceback)
+        elapsed = time.monotonic() - self._activation_timestamp
+        self._remaining_timeout = self._remaining_timeout - timedelta(seconds=elapsed)
+
+        self._active_timeout = None
+        self._activation_timestamp = None
+
+
+def format_duration(duration: timedelta | None) -> str:
+    """Format a timedelta into a human-readable string with appropriate units."""
+    if duration is None:
+        return 'None'
+
+    total_seconds = duration.total_seconds()
+
+    if total_seconds == 0:
+        return '0s'
+
+    # For very small durations, show in milliseconds
+    if total_seconds < 1:
+        milliseconds = total_seconds * 1000
+        if milliseconds < 1:
+            microseconds = total_seconds * 1_000_000
+            return f'{microseconds:.1f}μs'
+        return f'{milliseconds:.1f}ms'
+
+    # For durations less than 60 seconds, show in seconds
+    if total_seconds < _SECONDS_PER_MINUTE:
+        return f'{total_seconds:.2f}s'
+
+    # For durations less than 1 hour, show in minutes and seconds
+    if total_seconds < _SECONDS_PER_HOUR:
+        minutes = int(total_seconds // _SECONDS_PER_MINUTE)
+        seconds = total_seconds % _SECONDS_PER_MINUTE
+        if seconds == 0:
+            return f'{minutes}min'
+        return f'{minutes}min {seconds:.1f}s'
+
+    # For longer durations, show in hours, minutes, and seconds
+    hours = int(total_seconds // _SECONDS_PER_HOUR)
+    remaining_seconds = total_seconds % _SECONDS_PER_HOUR
+    minutes = int(remaining_seconds // _SECONDS_PER_MINUTE)
+    seconds = remaining_seconds % _SECONDS_PER_MINUTE
+
+    result = f'{hours}h'
+    if minutes > 0:
+        result += f' {minutes}min'
+    if seconds > 0:
+        result += f' {seconds:.1f}s'
+
+    return result
crawlee/_utils/urls.py
CHANGED
@@ -7,6 +7,7 @@ from yarl import URL

 if TYPE_CHECKING:
     from collections.abc import Iterator
+    from logging import Logger


 def is_url_absolute(url: str) -> bool:
@@ -22,13 +23,19 @@ def convert_to_absolute_url(base_url: str, relative_url: str) -> str:
     return str(URL(base_url).join(URL(relative_url)))


-def to_absolute_url_iterator(base_url: str, urls: Iterator[str]) -> Iterator[str]:
+def to_absolute_url_iterator(base_url: str, urls: Iterator[str], logger: Logger | None = None) -> Iterator[str]:
     """Convert an iterator of relative URLs to absolute URLs using a base URL."""
     for url in urls:
         if is_url_absolute(url):
             yield url
         else:
-
+            converted_url = convert_to_absolute_url(base_url, url)
+            # Skip the URL if conversion fails, probably due to an incorrect format, such as 'mailto:'.
+            if not is_url_absolute(converted_url):
+                if logger:
+                    logger.debug(f'Could not convert URL "{url}" to absolute using base URL "{base_url}". Skipping it.')
+                continue
+            yield converted_url


 _http_url_adapter = TypeAdapter(AnyHttpUrl)
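A quick sketch of the new skipping behaviour in `to_absolute_url_iterator`: relative links are resolved against the base URL, and links that still cannot be made absolute (the `mailto:` case called out in the code comment) are dropped with a debug log instead of being yielded. The logger wiring is the only addition here, and the expected output follows from the comment in the diff:

```python
import logging

from crawlee._utils.urls import to_absolute_url_iterator

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger('links')

links = iter(['/docs', 'https://example.org/page', 'mailto:hello@example.com'])
absolute = list(to_absolute_url_iterator('https://example.com', links, logger=logger))

# Expected: ['https://example.com/docs', 'https://example.org/page'];
# the mailto link is skipped and reported via logger.debug().
print(absolute)
```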
crawlee/browsers/_browser_pool.py
CHANGED
@@ -118,7 +118,10 @@ class BrowserPool:
         """Initialize a new instance with a single `PlaywrightBrowserPlugin` configured with the provided options.

         Args:
-            browser_type: The type of browser to launch
+            browser_type: The type of browser to launch:
+                - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+                - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+                    the system.
             user_data_dir: Path to a user data directory, which stores browser session data like cookies
                 and local storage.
             browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
@@ -135,7 +138,7 @@ class BrowserPool:
             kwargs: Additional arguments for default constructor.
         """
         plugin_options: dict = defaultdict(dict)
-        plugin_options['browser_launch_options'] = browser_launch_options
+        plugin_options['browser_launch_options'] = dict(browser_launch_options) if browser_launch_options else {}
         plugin_options['browser_new_context_options'] = browser_new_context_options or {}

         if headless is not None:
crawlee/browsers/_playwright_browser.py
CHANGED
@@ -78,7 +78,8 @@ class PlaywrightPersistentBrowser(Browser):

     async def _delete_temp_dir(self, _: BrowserContext | None) -> None:
         if self._temp_dir and self._temp_dir.exists():
-
+            temp_dir = self._temp_dir
+            await asyncio.to_thread(shutil.rmtree, temp_dir, ignore_errors=True)

     @override
     async def close(self, **kwargs: Any) -> None:
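The temp-dir deletion now copies the path into a local and hands `shutil.rmtree` to `asyncio.to_thread`, so blocking filesystem work does not stall the event loop. A generic sketch of that pattern (the directory here is a throwaway example, not the browser's actual profile directory):

```python
import asyncio
import shutil
import tempfile
from pathlib import Path


async def delete_temp_dir(temp_dir: Path) -> None:
    if temp_dir.exists():
        # rmtree is blocking; run it in a worker thread so other tasks keep running.
        await asyncio.to_thread(shutil.rmtree, temp_dir, ignore_errors=True)


async def main() -> None:
    temp_dir = Path(tempfile.mkdtemp(prefix='crawlee-example-'))
    (temp_dir / 'state.json').write_text('{}')
    await delete_temp_dir(temp_dir)
    print(temp_dir.exists())  # False


asyncio.run(main())
```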
crawlee/browsers/_playwright_browser_controller.py
CHANGED
@@ -2,6 +2,7 @@

 from __future__ import annotations

+from asyncio import Lock
 from datetime import datetime, timedelta, timezone
 from typing import TYPE_CHECKING, Any, cast

@@ -77,6 +78,19 @@ class PlaywrightBrowserController(BrowserController):

         self._total_opened_pages = 0

+        self._context_creation_lock: Lock | None = None
+
+    async def _get_context_creation_lock(self) -> Lock:
+        """Get context checking and creation lock.
+
+        It should be done with lock to prevent multiple concurrent attempts to create context, which could lead to
+        memory leak as one of the two concurrently created contexts will become orphaned and not properly closed.
+        """
+        if self._context_creation_lock:
+            return self._context_creation_lock
+        self._context_creation_lock = Lock()
+        return self._context_creation_lock
+
     @property
     @override
     def pages(self) -> list[Page]:
@@ -137,12 +151,6 @@ class PlaywrightBrowserController(BrowserController):
         Raises:
             ValueError: If the browser has reached the maximum number of open pages.
         """
-        if not self._browser_context:
-            self._browser_context = await self._create_browser_context(
-                browser_new_context_options=browser_new_context_options,
-                proxy_info=proxy_info,
-            )
-
         if not self.has_free_capacity:
             raise ValueError('Cannot open more pages in this browser.')

@@ -154,11 +162,12 @@ class PlaywrightBrowserController(BrowserController):
             )
             page = await new_context.new_page()
         else:
-
-
-
-
-
+            async with await self._get_context_creation_lock():
+                if not self._browser_context:
+                    self._browser_context = await self._create_browser_context(
+                        browser_new_context_options=browser_new_context_options,
+                        proxy_info=proxy_info,
+                    )
             page = await self._browser_context.new_page()

         # Handle page close event
@@ -169,7 +178,6 @@ class PlaywrightBrowserController(BrowserController):
         self._last_page_opened_at = datetime.now(timezone.utc)

         self._total_opened_pages += 1
-
         return page

     @override
@@ -206,10 +214,9 @@ class PlaywrightBrowserController(BrowserController):
         `self._fingerprint_generator` is available.
         """
         browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {}
-
         if proxy_info:
             if browser_new_context_options.get('proxy'):
-                logger.warning("browser_new_context_options['proxy']
+                logger.warning("browser_new_context_options['proxy'] overridden by explicit `proxy_info` argument.")

             browser_new_context_options['proxy'] = ProxySettings(
                 server=f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}',
@@ -244,5 +251,4 @@ class PlaywrightBrowserController(BrowserController):
         browser_new_context_options['extra_http_headers'] = browser_new_context_options.get(
             'extra_http_headers', extra_http_headers
         )
-
         return await self._browser.new_context(**browser_new_context_options)
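The controller now serializes browser-context creation behind a lazily created `asyncio.Lock` and re-checks `self._browser_context` inside the lock, so concurrent `new_page` calls cannot each create (and leak) a context. A generic sketch of that double-checked creation pattern (the `_create_context` coroutine is a stand-in, not Playwright's API):

```python
import asyncio


class ContextHolder:
    def __init__(self) -> None:
        self._context: object | None = None
        self._creation_lock: asyncio.Lock | None = None

    def _get_creation_lock(self) -> asyncio.Lock:
        # Created lazily so the holder can be constructed outside a running event loop.
        if self._creation_lock is None:
            self._creation_lock = asyncio.Lock()
        return self._creation_lock

    async def _create_context(self) -> object:
        await asyncio.sleep(0.1)  # stand-in for an expensive browser call
        return object()

    async def get_context(self) -> object:
        async with self._get_creation_lock():
            # Re-check inside the lock: a concurrent caller may have created the
            # context while we waited, so only one context is ever created.
            if self._context is None:
                self._context = await self._create_context()
        return self._context


async def main() -> None:
    holder = ContextHolder()
    contexts = await asyncio.gather(*(holder.get_context() for _ in range(5)))
    print(len({id(c) for c in contexts}))  # 1 -- all callers share one context


asyncio.run(main())
```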
crawlee/browsers/_playwright_browser_plugin.py
CHANGED
@@ -34,8 +34,8 @@ class PlaywrightBrowserPlugin(BrowserPlugin):

     It is a plugin designed to manage browser instances using the Playwright automation library. It acts as a factory
     for creating new browser instances and provides a unified interface for interacting with different browser types
-    (chromium, firefox, and
-    executable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each
+    (chromium, firefox, webkit and chrome). This class integrates configuration options for browser launches (headless
+    mode, executable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each
     browser instance, ensuring that resource limits are respected.
     """

@@ -55,7 +55,10 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
         """Initialize a new instance.

         Args:
-            browser_type: The type of browser to launch
+            browser_type: The type of browser to launch:
+                - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+                - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+                    the system.
             user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local
                 storage.
             browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
@@ -80,6 +83,17 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
             'chromium_sandbox': not config.disable_browser_sandbox,
         }

+        if browser_type == 'chrome' and default_launch_browser_options['executable_path']:
+            raise ValueError(
+                'Cannot use browser_type `chrome` with `Configuration.default_browser_path` or `executable_path` set.'
+            )
+
+        # Map 'chrome' to 'chromium' with the 'chrome' channel.
+        if browser_type == 'chrome':
+            browser_type = 'chromium'
+            # Chromium parameter 'channel' set to 'chrome' enables using installed Google Chrome.
+            default_launch_browser_options['channel'] = 'chrome'
+
         self._browser_type: BrowserType = browser_type
         self._browser_launch_options: dict[str, Any] = default_launch_browser_options | (browser_launch_options or {})
         self._browser_new_context_options = browser_new_context_options or {}
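With the mapping above, `browser_type='chrome'` launches Chromium with `channel='chrome'`, i.e. the locally installed Google Chrome. A hedged sketch of selecting it from a crawler, assuming `PlaywrightCrawler` forwards `browser_type` to this plugin (the changed `_playwright_crawler.py` and `_types.py` entries in the file list suggest it does) and that Chrome is installed:

```python
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    # 'chrome' must not be combined with executable_path/default_browser_path,
    # otherwise the plugin raises ValueError (see the hunk above).
    crawler = PlaywrightCrawler(browser_type='chrome', headless=True)

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Title: {await context.page.title()}')

    await crawler.run(['https://crawlee.dev'])


asyncio.run(main())
```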
crawlee/browsers/_types.py
CHANGED
crawlee/configuration.py
CHANGED
@@ -28,6 +28,8 @@ class Configuration(BaseSettings):
     Settings can also be configured via environment variables, prefixed with `CRAWLEE_`.
     """

+    # TODO: https://github.com/pydantic/pydantic-settings/issues/706
+    # Use `SettingsConfigDict(validate_by_name=True, validate_by_alias=True)` when issue is resolved.
     model_config = SettingsConfigDict(populate_by_name=True)

     internal_timeout: Annotated[timedelta | None, Field(alias='crawlee_internal_timeout')] = None
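Because `populate_by_name=True` is kept (pending the linked pydantic-settings issue), every setting can still be passed by its Python field name as well as read from its `CRAWLEE_`-prefixed alias. A small hedged example using the `internal_timeout` field shown above:

```python
from datetime import timedelta

from crawlee.configuration import Configuration

# By field name, which works thanks to populate_by_name=True.
config = Configuration(internal_timeout=timedelta(minutes=1))
print(config.internal_timeout)  # 0:01:00

# The same setting is also read from the environment variable
# CRAWLEE_INTERNAL_TIMEOUT (the field's alias), as noted in the class docstring.
```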
crawlee/crawlers/__init__.py
CHANGED
@@ -1,7 +1,7 @@
 from crawlee._utils.try_import import install_import_hook as _install_import_hook
 from crawlee._utils.try_import import try_import as _try_import

-from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, ParsedHttpCrawlingContext
+from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, HttpCrawlerOptions, ParsedHttpCrawlingContext
 from ._basic import BasicCrawler, BasicCrawlerOptions, BasicCrawlingContext, ContextPipeline
 from ._http import HttpCrawler, HttpCrawlingContext, HttpCrawlingResult

@@ -23,12 +23,14 @@ with _try_import(
     'AdaptivePlaywrightCrawler',
     'AdaptivePlaywrightCrawlingContext',
     'AdaptivePlaywrightPreNavCrawlingContext',
+    'AdaptivePlaywrightCrawlerStatisticState',
     'RenderingType',
     'RenderingTypePrediction',
     'RenderingTypePredictor',
 ):
     from ._adaptive_playwright import (
         AdaptivePlaywrightCrawler,
+        AdaptivePlaywrightCrawlerStatisticState,
         AdaptivePlaywrightCrawlingContext,
         AdaptivePlaywrightPreNavCrawlingContext,
         RenderingType,
@@ -41,6 +43,7 @@ __all__ = [
     'AbstractHttpCrawler',
     'AbstractHttpParser',
     'AdaptivePlaywrightCrawler',
+    'AdaptivePlaywrightCrawlerStatisticState',
     'AdaptivePlaywrightCrawlingContext',
     'AdaptivePlaywrightPreNavCrawlingContext',
     'BasicCrawler',
@@ -51,6 +54,7 @@ __all__ = [
     'BeautifulSoupParserType',
     'ContextPipeline',
     'HttpCrawler',
+    'HttpCrawlerOptions',
     'HttpCrawlingContext',
     'HttpCrawlingResult',
     'ParsedHttpCrawlingContext',
crawlee/crawlers/_abstract_http/__init__.py
CHANGED
@@ -1,9 +1,10 @@
-from ._abstract_http_crawler import AbstractHttpCrawler
+from ._abstract_http_crawler import AbstractHttpCrawler, HttpCrawlerOptions
 from ._abstract_http_parser import AbstractHttpParser
 from ._http_crawling_context import ParsedHttpCrawlingContext

 __all__ = [
     'AbstractHttpCrawler',
     'AbstractHttpParser',
+    'HttpCrawlerOptions',
     'ParsedHttpCrawlingContext',
 ]
crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
CHANGED
@@ -3,14 +3,16 @@ from __future__ import annotations
 import asyncio
 import logging
 from abc import ABC
+from datetime import timedelta
 from typing import TYPE_CHECKING, Any, Generic

 from more_itertools import partition
 from pydantic import ValidationError
-from typing_extensions import TypeVar
+from typing_extensions import NotRequired, TypeVar

-from crawlee._request import Request, RequestOptions
+from crawlee._request import Request, RequestOptions, RequestState
 from crawlee._utils.docs import docs_group
+from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
 from crawlee.errors import SessionError
@@ -32,9 +34,24 @@ TCrawlingContext = TypeVar('TCrawlingContext', bound=ParsedHttpCrawlingContext)
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)


+class HttpCrawlerOptions(
+    BasicCrawlerOptions[TCrawlingContext, TStatisticsState],
+    Generic[TCrawlingContext, TStatisticsState],
+):
+    """Arguments for the `AbstractHttpCrawler` constructor.
+
+    It is intended for typing forwarded `__init__` arguments in the subclasses.
+    """
+
+    navigation_timeout: NotRequired[timedelta | None]
+    """Timeout for the HTTP request."""
+
+
 @docs_group('Crawlers')
 class AbstractHttpCrawler(
-
+    BasicCrawler[TCrawlingContext, StatisticsState],
+    ABC,
+    Generic[TCrawlingContext, TParseResult, TSelectResult],
 ):
     """A web crawler for performing HTTP requests.

@@ -54,10 +71,13 @@ class AbstractHttpCrawler(
         self,
         *,
         parser: AbstractHttpParser[TParseResult, TSelectResult],
+        navigation_timeout: timedelta | None = None,
         **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]],
     ) -> None:
         self._parser = parser
+        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
         self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
+        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}

         if '_context_pipeline' not in kwargs:
             raise ValueError(
@@ -80,9 +100,7 @@ class AbstractHttpCrawler(
         this method simplifies cases where `TParseResult` is used for both generic parameters.
         """

-        class _ParsedHttpCrawler(
-            AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult, TSelectResult]
-        ):
+        class _ParsedHttpCrawler(AbstractHttpCrawler):
             def __init__(
                 self,
                 parser: AbstractHttpParser[TParseResult, TSelectResult] = static_parser,
@@ -110,9 +128,17 @@ class AbstractHttpCrawler(
     async def _execute_pre_navigation_hooks(
         self, context: BasicCrawlingContext
     ) -> AsyncGenerator[BasicCrawlingContext, None]:
-
-
-
+        context_id = id(context)
+        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
+
+        try:
+            for hook in self._pre_navigation_hooks:
+                async with self._shared_navigation_timeouts[context_id]:
+                    await hook(context)
+
+            yield context
+        finally:
+            self._shared_navigation_timeouts.pop(context_id, None)

     async def _parse_http_response(
         self, context: HttpCrawlingContext
@@ -163,9 +189,18 @@ class AbstractHttpCrawler(
         robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)

         kwargs.setdefault('strategy', 'same-hostname')
+        strategy = kwargs.get('strategy', 'same-hostname')

         links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
-
+
+        # Get base URL from <base> tag if present
+        extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]'))
+        base_url: str = (
+            str(extracted_base_urls[0])
+            if extracted_base_urls
+            else context.request.loaded_url or context.request.url
+        )
+        links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)

         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
@@ -173,7 +208,9 @@ class AbstractHttpCrawler(
             skipped = iter([])

         for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
-            request_options = RequestOptions(
+            request_options = RequestOptions(
+                url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
+            )

             if transform_request_function:
                 transform_request_options = transform_request_function(request_options)
@@ -212,13 +249,16 @@ class AbstractHttpCrawler(
         Yields:
             The original crawling context enhanced by HTTP response.
         """
-
-
-
-
-
-
+        async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+            result = await self._http_client.crawl(
+                request=context.request,
+                session=context.session,
+                proxy_info=context.proxy_info,
+                statistics=self._statistics,
+                timeout=remaining_timeout,
+            )

+        context.request.state = RequestState.AFTER_NAV
         yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response)

     async def _handle_status_code_response(
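`HttpCrawlerOptions` extends `BasicCrawlerOptions` with the new `navigation_timeout` key so subclasses can type the keyword arguments they forward to `AbstractHttpCrawler.__init__`. A simplified sketch of that pattern using plain TypedDicts (the class names below are illustrative stand-ins, not the real crawlee hierarchy):

```python
from __future__ import annotations

from datetime import timedelta

from typing_extensions import NotRequired, TypedDict, Unpack


class BasicOptions(TypedDict):
    """Stand-in for BasicCrawlerOptions: options every crawler accepts."""

    max_request_retries: NotRequired[int]


class HttpOptions(BasicOptions):
    """Stand-in for HttpCrawlerOptions: adds the HTTP-specific knob."""

    navigation_timeout: NotRequired[timedelta | None]


class MyHttpCrawler:
    def __init__(self, *, navigation_timeout: timedelta | None = None, **kwargs: Unpack[BasicOptions]) -> None:
        # Same shape as AbstractHttpCrawler.__init__: consume the HTTP-specific
        # option here, forward the rest to the base class.
        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
        self._basic_kwargs = kwargs


class ConcreteCrawler(MyHttpCrawler):
    def __init__(self, **kwargs: Unpack[HttpOptions]) -> None:
        # Type checkers now verify both navigation_timeout and the basic options.
        super().__init__(**kwargs)


crawler = ConcreteCrawler(navigation_timeout=timedelta(seconds=30), max_request_retries=2)
print(crawler._navigation_timeout)  # 0:00:30
```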
crawlee/crawlers/_abstract_http/_abstract_http_parser.py
CHANGED
@@ -16,7 +16,7 @@ if TYPE_CHECKING:


 @docs_group('HTTP parsers')
-class AbstractHttpParser(Generic[TParseResult, TSelectResult]
+class AbstractHttpParser(ABC, Generic[TParseResult, TSelectResult]):
     """Parser used for parsing HTTP response and inspecting parsed result to find links or detect blocking."""

     @abstractmethod