crawlee 1.0.3b6__py3-none-any.whl → 1.2.2b24__py3-none-any.whl
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- crawlee/__init__.py +2 -1
- crawlee/_browserforge_workaround.py +7 -3
- crawlee/_request.py +32 -13
- crawlee/_service_locator.py +4 -4
- crawlee/_types.py +44 -5
- crawlee/_utils/context.py +3 -3
- crawlee/_utils/file.py +8 -1
- crawlee/_utils/globs.py +4 -4
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +27 -3
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +13 -6
- crawlee/_utils/system.py +27 -11
- crawlee/_utils/time.py +41 -1
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +5 -2
- crawlee/browsers/_playwright_browser.py +2 -1
- crawlee/browsers/_playwright_browser_controller.py +1 -1
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +3 -1
- crawlee/crawlers/__init__.py +5 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +54 -16
- crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +21 -30
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
- crawlee/crawlers/_basic/_basic_crawler.py +156 -131
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +23 -4
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +68 -23
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +12 -6
- crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +68 -14
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/otel/crawler_instrumentor.py +4 -6
- crawlee/request_loaders/_sitemap_request_loader.py +23 -5
- crawlee/router.py +13 -3
- crawlee/sessions/_cookies.py +13 -8
- crawlee/sessions/_models.py +3 -3
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +51 -9
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +4 -0
- crawlee/storage_clients/_base/_dataset_client.py +2 -2
- crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_file_system/_dataset_client.py +6 -5
- crawlee/storage_clients/_file_system/_key_value_store_client.py +7 -4
- crawlee/storage_clients/_file_system/_request_queue_client.py +29 -10
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +292 -0
- crawlee/storage_clients/_redis/_dataset_client.py +329 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
- crawlee/storage_clients/_redis/_storage_client.py +149 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/_client_mixin.py +1 -1
- crawlee/storage_clients/_sql/_db_models.py +1 -2
- crawlee/storage_clients/models.py +8 -3
- crawlee/storages/_key_value_store.py +5 -2
- crawlee/storages/_storage_instance_manager.py +103 -44
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/METADATA +14 -16
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/RECORD +82 -69
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/WHEEL +1 -1
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/entry_points.txt +0 -0
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/licenses/LICENSE +0 -0
crawlee/_utils/time.py
CHANGED
@@ -3,11 +3,14 @@ from __future__ import annotations
 import time
 from contextlib import contextmanager
 from dataclasses import dataclass
+from datetime import timedelta
 from typing import TYPE_CHECKING

+from async_timeout import Timeout, timeout
+
 if TYPE_CHECKING:
     from collections.abc import Iterator
-    from …
+    from types import TracebackType

 _SECONDS_PER_MINUTE = 60
 _SECONDS_PER_HOUR = 3600

@@ -35,6 +38,43 @@ def measure_time() -> Iterator[TimerResult]:
         result.cpu = after_cpu - before_cpu


+class SharedTimeout:
+    """Keeps track of a time budget shared by multiple independent async operations.
+
+    Provides a reusable, non-reentrant context manager interface.
+    """
+
+    def __init__(self, timeout: timedelta) -> None:
+        self._remaining_timeout = timeout
+        self._active_timeout: Timeout | None = None
+        self._activation_timestamp: float | None = None
+
+    async def __aenter__(self) -> timedelta:
+        if self._active_timeout is not None or self._activation_timestamp is not None:
+            raise RuntimeError('A shared timeout context cannot be entered twice at the same time')
+
+        self._activation_timestamp = time.monotonic()
+        self._active_timeout = new_timeout = timeout(self._remaining_timeout.total_seconds())
+        await new_timeout.__aenter__()
+        return self._remaining_timeout
+
+    async def __aexit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
+    ) -> None:
+        if self._active_timeout is None or self._activation_timestamp is None:
+            raise RuntimeError('Logic error')
+
+        await self._active_timeout.__aexit__(exc_type, exc_value, exc_traceback)
+        elapsed = time.monotonic() - self._activation_timestamp
+        self._remaining_timeout = self._remaining_timeout - timedelta(seconds=elapsed)
+
+        self._active_timeout = None
+        self._activation_timestamp = None
+
+
 def format_duration(duration: timedelta | None) -> str:
     """Format a timedelta into a human-readable string with appropriate units."""
     if duration is None:
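The new `SharedTimeout` keeps one time budget across several sequential `async with` blocks: each block deducts its elapsed wall-clock time from the remainder, and the underlying `async_timeout` cancels the block once the budget runs out. A minimal usage sketch of this internal helper (the `asyncio.sleep` calls merely stand in for real work):

```python
import asyncio
from datetime import timedelta

from crawlee._utils.time import SharedTimeout  # internal helper added in this release


async def main() -> None:
    budget = SharedTimeout(timedelta(seconds=1))  # one second shared by all blocks below

    async with budget:
        await asyncio.sleep(0.4)  # consumes ~0.4 s of the budget

    async with budget:
        await asyncio.sleep(0.4)  # only ~0.6 s remain; sleeping past that raises TimeoutError


asyncio.run(main())
```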
crawlee/_utils/urls.py
CHANGED
@@ -7,6 +7,7 @@ from yarl import URL

 if TYPE_CHECKING:
     from collections.abc import Iterator
+    from logging import Logger


 def is_url_absolute(url: str) -> bool:

@@ -22,13 +23,19 @@ def convert_to_absolute_url(base_url: str, relative_url: str) -> str:
     return str(URL(base_url).join(URL(relative_url)))


-def to_absolute_url_iterator(base_url: str, urls: Iterator[str]) -> Iterator[str]:
+def to_absolute_url_iterator(base_url: str, urls: Iterator[str], logger: Logger | None = None) -> Iterator[str]:
     """Convert an iterator of relative URLs to absolute URLs using a base URL."""
     for url in urls:
         if is_url_absolute(url):
             yield url
         else:
-            …
+            converted_url = convert_to_absolute_url(base_url, url)
+            # Skip the URL if conversion fails, probably due to an incorrect format, such as 'mailto:'.
+            if not is_url_absolute(converted_url):
+                if logger:
+                    logger.debug(f'Could not convert URL "{url}" to absolute using base URL "{base_url}". Skipping it.')
+                continue
+            yield converted_url


 _http_url_adapter = TypeAdapter(AnyHttpUrl)
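With the new optional `logger` parameter, links that cannot be converted to an absolute URL (such as `mailto:` links, per the comment above) are logged at DEBUG level and skipped rather than yielded. A small illustrative sketch (the example URLs are made up):

```python
import logging

from crawlee._utils.urls import to_absolute_url_iterator

logger = logging.getLogger(__name__)

links = iter(['/about', 'https://example.com/docs', 'mailto:team@example.com'])
absolute = list(to_absolute_url_iterator('https://example.com', links, logger=logger))
# absolute == ['https://example.com/about', 'https://example.com/docs']
# the 'mailto:' entry is reported via logger.debug() and dropped
```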
crawlee/browsers/_browser_pool.py
CHANGED

@@ -118,7 +118,10 @@ class BrowserPool:
         """Initialize a new instance with a single `PlaywrightBrowserPlugin` configured with the provided options.

         Args:
-            browser_type: The type of browser to launch
+            browser_type: The type of browser to launch:
+                - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+                - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+                    the system.
             user_data_dir: Path to a user data directory, which stores browser session data like cookies
                 and local storage.
             browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided

@@ -135,7 +138,7 @@ class BrowserPool:
             kwargs: Additional arguments for default constructor.
         """
         plugin_options: dict = defaultdict(dict)
-        plugin_options['browser_launch_options'] = browser_launch_options
+        plugin_options['browser_launch_options'] = dict(browser_launch_options) if browser_launch_options else {}
         plugin_options['browser_new_context_options'] = browser_new_context_options or {}

         if headless is not None:
crawlee/browsers/_playwright_browser.py
CHANGED

@@ -78,7 +78,8 @@ class PlaywrightPersistentBrowser(Browser):

     async def _delete_temp_dir(self, _: BrowserContext | None) -> None:
         if self._temp_dir and self._temp_dir.exists():
-            …
+            temp_dir = self._temp_dir
+            await asyncio.to_thread(shutil.rmtree, temp_dir, ignore_errors=True)

     @override
     async def close(self, **kwargs: Any) -> None:
crawlee/browsers/_playwright_browser_controller.py
CHANGED

@@ -216,7 +216,7 @@ class PlaywrightBrowserController(BrowserController):
         browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {}
         if proxy_info:
             if browser_new_context_options.get('proxy'):
-                logger.warning("browser_new_context_options['proxy'] …
+                logger.warning("browser_new_context_options['proxy'] overridden by explicit `proxy_info` argument.")

             browser_new_context_options['proxy'] = ProxySettings(
                 server=f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}',
crawlee/browsers/_playwright_browser_plugin.py
CHANGED

@@ -34,8 +34,8 @@ class PlaywrightBrowserPlugin(BrowserPlugin):

     It is a plugin designed to manage browser instances using the Playwright automation library. It acts as a factory
     for creating new browser instances and provides a unified interface for interacting with different browser types
-    (chromium, firefox, and …
-    executable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each
+    (chromium, firefox, webkit and chrome). This class integrates configuration options for browser launches (headless
+    mode, executable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each
     browser instance, ensuring that resource limits are respected.
     """

@@ -55,7 +55,10 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
         """Initialize a new instance.

         Args:
-            browser_type: The type of browser to launch
+            browser_type: The type of browser to launch:
+                - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+                - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+                    the system.
             user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local
                 storage.
             browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided

@@ -80,6 +83,17 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
             'chromium_sandbox': not config.disable_browser_sandbox,
         }

+        if browser_type == 'chrome' and default_launch_browser_options['executable_path']:
+            raise ValueError(
+                'Cannot use browser_type `chrome` with `Configuration.default_browser_path` or `executable_path` set.'
+            )
+
+        # Map 'chrome' to 'chromium' with the 'chrome' channel.
+        if browser_type == 'chrome':
+            browser_type = 'chromium'
+            # Chromium parameter 'channel' set to 'chrome' enables using installed Google Chrome.
+            default_launch_browser_options['channel'] = 'chrome'
+
         self._browser_type: BrowserType = browser_type
         self._browser_launch_options: dict[str, Any] = default_launch_browser_options | (browser_launch_options or {})
         self._browser_new_context_options = browser_new_context_options or {}
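Both the plugin and `BrowserPool` now accept `browser_type='chrome'`, which is internally rewritten to Chromium launched with `channel='chrome'`, so Playwright drives the locally installed Google Chrome. A hedged sketch of opting into it from `PlaywrightCrawler`, which forwards `browser_type` to the plugin (requires Chrome to be installed locally):

```python
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    # 'chrome' uses the locally installed Google Chrome via Chromium's channel='chrome';
    # it cannot be combined with a custom executable_path / default_browser_path.
    crawler = PlaywrightCrawler(browser_type='chrome', headless=True)

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Visited {context.request.url}: {await context.page.title()}')

    await crawler.run(['https://crawlee.dev'])


asyncio.run(main())
```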
crawlee/browsers/_types.py
CHANGED
crawlee/configuration.py
CHANGED
@@ -28,7 +28,9 @@ class Configuration(BaseSettings):
     Settings can also be configured via environment variables, prefixed with `CRAWLEE_`.
     """

-    …
+    # TODO: https://github.com/pydantic/pydantic-settings/issues/706
+    # Use `SettingsConfigDict(validate_by_name=True, validate_by_alias=True)` when issue is resolved.
+    model_config = SettingsConfigDict(populate_by_name=True)

     internal_timeout: Annotated[timedelta | None, Field(alias='crawlee_internal_timeout')] = None
     """Timeout for the internal asynchronous operations."""
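With `populate_by_name=True` in `model_config`, a setting can be supplied either by its field name or by its `crawlee_`-prefixed alias (which is what the `CRAWLEE_*` environment variables map to). A brief illustration, not taken from the package docs:

```python
from datetime import timedelta

from crawlee.configuration import Configuration

# By field name (allowed thanks to populate_by_name=True)...
config = Configuration(internal_timeout=timedelta(minutes=2))

# ...or by alias, mirroring the CRAWLEE_INTERNAL_TIMEOUT environment variable.
config = Configuration(crawlee_internal_timeout=timedelta(minutes=2))
```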
crawlee/crawlers/__init__.py
CHANGED
@@ -1,7 +1,7 @@
 from crawlee._utils.try_import import install_import_hook as _install_import_hook
 from crawlee._utils.try_import import try_import as _try_import

-from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, ParsedHttpCrawlingContext
+from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, HttpCrawlerOptions, ParsedHttpCrawlingContext
 from ._basic import BasicCrawler, BasicCrawlerOptions, BasicCrawlingContext, ContextPipeline
 from ._http import HttpCrawler, HttpCrawlingContext, HttpCrawlingResult

@@ -23,12 +23,14 @@ with _try_import(
     'AdaptivePlaywrightCrawler',
     'AdaptivePlaywrightCrawlingContext',
     'AdaptivePlaywrightPreNavCrawlingContext',
+    'AdaptivePlaywrightCrawlerStatisticState',
     'RenderingType',
     'RenderingTypePrediction',
     'RenderingTypePredictor',
 ):
     from ._adaptive_playwright import (
         AdaptivePlaywrightCrawler,
+        AdaptivePlaywrightCrawlerStatisticState,
         AdaptivePlaywrightCrawlingContext,
         AdaptivePlaywrightPreNavCrawlingContext,
         RenderingType,

@@ -41,6 +43,7 @@ __all__ = [
     'AbstractHttpCrawler',
     'AbstractHttpParser',
     'AdaptivePlaywrightCrawler',
+    'AdaptivePlaywrightCrawlerStatisticState',
     'AdaptivePlaywrightCrawlingContext',
     'AdaptivePlaywrightPreNavCrawlingContext',
     'BasicCrawler',

@@ -51,6 +54,7 @@ __all__ = [
     'BeautifulSoupParserType',
     'ContextPipeline',
     'HttpCrawler',
+    'HttpCrawlerOptions',
     'HttpCrawlingContext',
     'HttpCrawlingResult',
     'ParsedHttpCrawlingContext',
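`HttpCrawlerOptions` and `AdaptivePlaywrightCrawlerStatisticState` are now importable straight from `crawlee.crawlers`. `HttpCrawlerOptions` is intended for typing constructor kwargs that a custom crawler forwards to the HTTP crawler base class; a hedged sketch (the wrapper class is hypothetical, and it assumes `ParselCrawler` accepts these forwarded options):

```python
from __future__ import annotations

from typing import TYPE_CHECKING

from crawlee.crawlers import HttpCrawlerOptions, ParselCrawler, ParselCrawlingContext

if TYPE_CHECKING:
    from typing_extensions import Unpack


class MyParselCrawler(ParselCrawler):
    """Hypothetical wrapper that forwards typed constructor options to the base crawler."""

    def __init__(self, **kwargs: Unpack[HttpCrawlerOptions[ParselCrawlingContext]]) -> None:
        super().__init__(**kwargs)
```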
crawlee/crawlers/_abstract_http/__init__.py
CHANGED

@@ -1,9 +1,10 @@
-from ._abstract_http_crawler import AbstractHttpCrawler
+from ._abstract_http_crawler import AbstractHttpCrawler, HttpCrawlerOptions
 from ._abstract_http_parser import AbstractHttpParser
 from ._http_crawling_context import ParsedHttpCrawlingContext

 __all__ = [
     'AbstractHttpCrawler',
     'AbstractHttpParser',
+    'HttpCrawlerOptions',
     'ParsedHttpCrawlingContext',
 ]
crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
CHANGED

@@ -3,14 +3,16 @@ from __future__ import annotations
 import asyncio
 import logging
 from abc import ABC
+from datetime import timedelta
 from typing import TYPE_CHECKING, Any, Generic

 from more_itertools import partition
 from pydantic import ValidationError
-from typing_extensions import TypeVar
+from typing_extensions import NotRequired, TypeVar

-from crawlee._request import Request, RequestOptions
+from crawlee._request import Request, RequestOptions, RequestState
 from crawlee._utils.docs import docs_group
+from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
 from crawlee.errors import SessionError

@@ -32,6 +34,19 @@ TCrawlingContext = TypeVar('TCrawlingContext', bound=ParsedHttpCrawlingContext)
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)


+class HttpCrawlerOptions(
+    BasicCrawlerOptions[TCrawlingContext, TStatisticsState],
+    Generic[TCrawlingContext, TStatisticsState],
+):
+    """Arguments for the `AbstractHttpCrawler` constructor.
+
+    It is intended for typing forwarded `__init__` arguments in the subclasses.
+    """
+
+    navigation_timeout: NotRequired[timedelta | None]
+    """Timeout for the HTTP request."""
+
+
 @docs_group('Crawlers')
 class AbstractHttpCrawler(
     BasicCrawler[TCrawlingContext, StatisticsState],

@@ -56,10 +71,13 @@ class AbstractHttpCrawler(
         self,
         *,
         parser: AbstractHttpParser[TParseResult, TSelectResult],
+        navigation_timeout: timedelta | None = None,
         **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]],
     ) -> None:
         self._parser = parser
+        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
         self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
+        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}

         if '_context_pipeline' not in kwargs:
             raise ValueError(

@@ -82,9 +100,7 @@ class AbstractHttpCrawler(
         this method simplifies cases where `TParseResult` is used for both generic parameters.
         """

-        class _ParsedHttpCrawler(
-            AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult, TSelectResult]
-        ):
+        class _ParsedHttpCrawler(AbstractHttpCrawler):
             def __init__(
                 self,
                 parser: AbstractHttpParser[TParseResult, TSelectResult] = static_parser,

@@ -112,9 +128,17 @@ class AbstractHttpCrawler(
     async def _execute_pre_navigation_hooks(
         self, context: BasicCrawlingContext
     ) -> AsyncGenerator[BasicCrawlingContext, None]:
-        …
-        …
-        …
+        context_id = id(context)
+        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
+
+        try:
+            for hook in self._pre_navigation_hooks:
+                async with self._shared_navigation_timeouts[context_id]:
+                    await hook(context)
+
+            yield context
+        finally:
+            self._shared_navigation_timeouts.pop(context_id, None)

     async def _parse_http_response(
         self, context: HttpCrawlingContext

@@ -165,9 +189,18 @@ class AbstractHttpCrawler(
         robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)

         kwargs.setdefault('strategy', 'same-hostname')
+        strategy = kwargs.get('strategy', 'same-hostname')

         links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
-        …
+
+        # Get base URL from <base> tag if present
+        extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]'))
+        base_url: str = (
+            str(extracted_base_urls[0])
+            if extracted_base_urls
+            else context.request.loaded_url or context.request.url
+        )
+        links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)

         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)

@@ -175,7 +208,9 @@ class AbstractHttpCrawler(
             skipped = iter([])

         for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
-            request_options = RequestOptions(…
+            request_options = RequestOptions(
+                url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
+            )

             if transform_request_function:
                 transform_request_options = transform_request_function(request_options)

@@ -214,13 +249,16 @@ class AbstractHttpCrawler(
         Yields:
             The original crawling context enhanced by HTTP response.
         """
-        …
-        …
-        …
-        …
-        …
-        …
+        async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+            result = await self._http_client.crawl(
+                request=context.request,
+                session=context.session,
+                proxy_info=context.proxy_info,
+                statistics=self._statistics,
+                timeout=remaining_timeout,
+            )

+        context.request.state = RequestState.AFTER_NAV
         yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response)

     async def _handle_status_code_response(
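The new `navigation_timeout` (one minute by default) is the budget that `SharedTimeout` spreads across the pre-navigation hooks and the HTTP request of a single page. A hedged sketch, assuming the concrete crawlers such as `ParselCrawler` forward the option to `AbstractHttpCrawler`:

```python
import asyncio
from datetime import timedelta

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


async def main() -> None:
    # Pre-navigation hooks plus the HTTP request must finish within 10 seconds per page.
    crawler = ParselCrawler(navigation_timeout=timedelta(seconds=10))

    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        await context.push_data({
            'url': context.request.url,
            'title': context.selector.css('title::text').get(),
        })

    await crawler.run(['https://crawlee.dev'])


asyncio.run(main())
```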
crawlee/crawlers/_adaptive_playwright/__init__.py
CHANGED

@@ -11,13 +11,16 @@ _install_import_hook(__name__)

 # The following imports are wrapped in try_import to handle optional dependencies,
 # ensuring the module can still function even if these dependencies are missing.
-with _try_import(__name__, '…
+with _try_import(__name__, 'RenderingType', 'RenderingTypePrediction', 'RenderingTypePredictor'):
     from ._rendering_type_predictor import RenderingType, RenderingTypePrediction, RenderingTypePredictor
-with _try_import(__name__, '…
+with _try_import(__name__, 'AdaptivePlaywrightCrawler'):
     from ._adaptive_playwright_crawler import AdaptivePlaywrightCrawler
+with _try_import(__name__, 'AdaptivePlaywrightCrawlerStatisticState'):
+    from ._adaptive_playwright_crawler import AdaptivePlaywrightCrawlerStatisticState

 __all__ = [
     'AdaptivePlaywrightCrawler',
+    'AdaptivePlaywrightCrawlerStatisticState',
     'AdaptivePlaywrightCrawlingContext',
     'AdaptivePlaywrightPreNavCrawlingContext',
     'RenderingType',
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py
CHANGED

@@ -27,23 +27,16 @@ from crawlee.crawlers import (
 )
 from crawlee.crawlers._beautifulsoup._beautifulsoup_parser import BeautifulSoupParser
 from crawlee.crawlers._parsel._parsel_parser import ParselParser
+from crawlee.crawlers._playwright._playwright_crawler import _PlaywrightCrawlerAdditionalOptions
 from crawlee.statistics import Statistics, StatisticsState

-from ._adaptive_playwright_crawler_statistics import (
-    AdaptivePlaywrightCrawlerStatisticState,
-)
+from ._adaptive_playwright_crawler_statistics import AdaptivePlaywrightCrawlerStatisticState
 from ._adaptive_playwright_crawling_context import (
     AdaptivePlaywrightCrawlingContext,
     AdaptivePlaywrightPreNavCrawlingContext,
 )
-from ._rendering_type_predictor import (
-    DefaultRenderingTypePredictor,
-    RenderingType,
-    RenderingTypePredictor,
-)
-from ._result_comparator import (
-    create_default_comparator,
-)
+from ._rendering_type_predictor import DefaultRenderingTypePredictor, RenderingType, RenderingTypePredictor
+from ._result_comparator import create_default_comparator

 if TYPE_CHECKING:
     from types import TracebackType

@@ -51,7 +44,6 @@ if TYPE_CHECKING:
     from typing_extensions import Unpack

     from crawlee.crawlers._basic._basic_crawler import _BasicCrawlerOptions
-    from crawlee.crawlers._playwright._playwright_crawler import _PlaywrightCrawlerAdditionalOptions


 TStaticParseResult = TypeVar('TStaticParseResult')

@@ -71,7 +63,6 @@ class _NonPersistentStatistics(Statistics):
     async def __aenter__(self) -> Self:
         self._active = True
         await self._state.initialize()
-        self._after_initialize()
         return self

     async def __aexit__(

@@ -149,10 +140,6 @@ class AdaptivePlaywrightCrawler(
                 non-default configuration.
             kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
         """
-        # Some sub crawler kwargs are internally modified. Prepare copies.
-        basic_crawler_kwargs_for_static_crawler = deepcopy(kwargs)
-        basic_crawler_kwargs_for_pw_crawler = deepcopy(kwargs)
-
         # Adaptive crawling related.
         self.rendering_type_predictor = rendering_type_predictor or DefaultRenderingTypePredictor()
         self.result_checker = result_checker or (lambda _: True)

@@ -162,19 +149,21 @@ class AdaptivePlaywrightCrawler(
         if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
             kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)

-        …
+        adaptive_statistics = statistics or Statistics(state_model=AdaptivePlaywrightCrawlerStatisticState)
+
+        super().__init__(statistics=adaptive_statistics, **kwargs)

         # Sub crawlers related.
-        playwright_crawler_specific_kwargs = playwright_crawler_specific_kwargs or …
+        playwright_crawler_specific_kwargs = playwright_crawler_specific_kwargs or _PlaywrightCrawlerAdditionalOptions()

         # Each sub crawler will use custom logger .
         static_logger = getLogger('Subcrawler_static')
         static_logger.setLevel(logging.ERROR)
-        basic_crawler_kwargs_for_static_crawler…
+        basic_crawler_kwargs_for_static_crawler: _BasicCrawlerOptions = {'_logger': static_logger, **kwargs}

         pw_logger = getLogger('Subcrawler_playwright')
         pw_logger.setLevel(logging.ERROR)
-        basic_crawler_kwargs_for_pw_crawler…
+        basic_crawler_kwargs_for_pw_crawler: _BasicCrawlerOptions = {'_logger': pw_logger, **kwargs}

         # Initialize sub crawlers to create their pipelines.
         static_crawler_class = AbstractHttpCrawler.create_parsed_http_crawler_class(static_parser=static_parser)

@@ -295,11 +284,14 @@ class AdaptivePlaywrightCrawler(
         use_state_function = context.use_state

         # New result is created and injected to newly created context. This is done to ensure isolation of sub crawlers.
-        result = RequestHandlerRunResult(…
+        result = RequestHandlerRunResult(
+            key_value_store_getter=self.get_key_value_store,
+            request=context.request,
+        )
         context_linked_to_result = BasicCrawlingContext(
-            request=…
-            session=…
-            proxy_info=…
+            request=result.request,
+            session=context.session,
+            proxy_info=context.proxy_info,
             send_request=context.send_request,
             add_requests=result.add_requests,
             push_data=result.push_data,

@@ -337,7 +329,7 @@ class AdaptivePlaywrightCrawler(
                 )
                 await self.router(adaptive_crawling_context)

-            return self._static_context_pipeline(context_linked_to_result, from_static_pipeline_to_top_router)
+            return self._static_context_pipeline(context_linked_to_result, from_static_pipeline_to_top_router)  # ty: ignore[invalid-argument-type]

         if rendering_type == 'client only':

@@ -347,7 +339,7 @@ class AdaptivePlaywrightCrawler(
                 )
                 await self.router(adaptive_crawling_context)

-            return self._pw_context_pipeline(context_linked_to_result, from_pw_pipeline_to_top_router)
+            return self._pw_context_pipeline(context_linked_to_result, from_pw_pipeline_to_top_router)  # ty: ignore[invalid-argument-type]

         raise RuntimeError(
             f'Not a valid rendering type. Must be one of the following: {", ".join(get_args(RenderingType))}'

@@ -407,12 +399,9 @@ class AdaptivePlaywrightCrawler(
             raise pw_run.exception

         if pw_run.result:
-            self._context_result_map[context] = pw_run.result
-
             if should_detect_rendering_type:
                 detection_result: RenderingType
                 static_run = await self._crawl_one('static', context=context, state=old_state_copy)
-
                 if static_run.result and self.result_comparator(static_run.result, pw_run.result):
                     detection_result = 'static'
                 else:

@@ -421,6 +410,8 @@ class AdaptivePlaywrightCrawler(
                 context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}')
                 self.rendering_type_predictor.store_result(context.request, detection_result)

+            self._context_result_map[context] = pw_run.result
+
     def pre_navigation_hook(
         self,
         hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]] | None = None,

crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py
CHANGED

@@ -17,7 +17,7 @@ if TYPE_CHECKING:
     from playwright.async_api import Page, Response
     from typing_extensions import Self

-    from crawlee.crawlers._playwright._types import BlockRequestsFunction
+    from crawlee.crawlers._playwright._types import BlockRequestsFunction, GotoOptions


 TStaticParseResult = TypeVar('TStaticParseResult')

@@ -190,8 +190,9 @@ class AdaptivePlaywrightCrawlingContext(
         http_response = await PlaywrightHttpResponse.from_playwright_response(
             response=context.response, protocol=protocol_guess or ''
         )
-        # block_requests …
+        # block_requests and goto_options are useful only on pre-navigation contexts. It is useless here.
         context_kwargs.pop('block_requests')
+        context_kwargs.pop('goto_options')
         return cls(
             parsed_content=await parser.parse(http_response),
             http_response=http_response,

@@ -212,6 +213,9 @@ class AdaptivePlaywrightPreNavCrawlingContext(BasicCrawlingContext):
     block_requests: BlockRequestsFunction | None = None
     """Blocks network requests matching specified URL patterns."""

+    goto_options: GotoOptions | None = None
+    """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""
+
     @property
     def page(self) -> Page:
         """The Playwright `Page` object for the current page.
|