crawlee 0.6.13b31__py3-none-any.whl → 1.1.1b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlee might be problematic.
- crawlee/_autoscaling/snapshotter.py +1 -1
- crawlee/_request.py +34 -22
- crawlee/_service_locator.py +44 -24
- crawlee/_types.py +86 -33
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +15 -0
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +1 -1
- crawlee/_utils/system.py +3 -3
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +4 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +2 -0
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +6 -2
- crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
- crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +39 -15
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +2 -1
- crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
- crawlee/crawlers/_basic/_basic_crawler.py +124 -37
- crawlee/crawlers/_playwright/_playwright_crawler.py +17 -5
- crawlee/events/_event_manager.py +3 -1
- crawlee/events/_types.py +6 -6
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/fingerprint_suite/_types.py +2 -2
- crawlee/otel/crawler_instrumentor.py +3 -3
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_request_list.py +1 -1
- crawlee/request_loaders/_sitemap_request_loader.py +23 -5
- crawlee/sessions/_models.py +2 -2
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +33 -2
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_storage_client.py +13 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +29 -27
- crawlee/storage_clients/_file_system/_key_value_store_client.py +29 -25
- crawlee/storage_clients/_file_system/_request_queue_client.py +53 -34
- crawlee/storage_clients/_file_system/_storage_client.py +16 -3
- crawlee/storage_clients/_file_system/_utils.py +0 -0
- crawlee/storage_clients/_memory/_dataset_client.py +16 -4
- crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
- crawlee/storage_clients/_memory/_request_queue_client.py +16 -4
- crawlee/storage_clients/_memory/_storage_client.py +6 -3
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +295 -0
- crawlee/storage_clients/_redis/_dataset_client.py +325 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
- crawlee/storage_clients/_redis/_storage_client.py +146 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +291 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +10 -10
- crawlee/storages/_base.py +5 -1
- crawlee/storages/_dataset.py +12 -2
- crawlee/storages/_key_value_store.py +17 -4
- crawlee/storages/_request_queue.py +10 -2
- crawlee/storages/_storage_instance_manager.py +133 -71
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/METADATA +17 -6
- {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/RECORD +82 -59
- {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/WHEEL +0 -0
- {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/licenses/LICENSE +0 -0
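The headline additions in this release are the new Redis- and SQL-backed storage clients (`crawlee/storage_clients/_redis/` and `crawlee/storage_clients/_sql/`). Below is a hedged sketch of how a non-default storage client is typically wired in through the service locator; the `SqlStorageClient` class name and its constructor defaults are inferred from the new package layout above and are not confirmed here, so check the 1.x documentation before copying.

# Hypothetical wiring of one of the new storage backends; the class name and
# constructor arguments are assumptions based on the new package layout.
from crawlee import service_locator
from crawlee.storage_clients import SqlStorageClient  # assumed export


def configure_storage() -> None:
    # Registering the client before any storage is opened makes datasets,
    # key-value stores and request queues use the SQL backend instead of the
    # default file-system one.
    storage_client = SqlStorageClient()  # assumed default: a local SQLite database
    service_locator.set_storage_client(storage_client)


configure_storage()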
crawlee/_utils/robots.py
CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from logging import getLogger
 from typing import TYPE_CHECKING
 
 from protego import Protego
@@ -15,6 +16,9 @@ if TYPE_CHECKING:
     from crawlee.proxy_configuration import ProxyInfo
 
 
+logger = getLogger(__name__)
+
+
 class RobotsTxtFile:
     def __init__(
         self, url: str, robots: Protego, http_client: HttpClient | None = None, proxy_info: ProxyInfo | None = None
@@ -56,12 +60,20 @@ class RobotsTxtFile:
             http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt file.
             proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
         """
-
-
-
-
+        try:
+            response = await http_client.send_request(url, proxy_info=proxy_info)
+
+            body = (
+                b'User-agent: *\nAllow: /'
+                if is_status_code_client_error(response.status_code)
+                else await response.read()
+            )
+            robots = Protego.parse(body.decode('utf-8'))
+
+        except Exception as e:
+            logger.warning(f'Failed to fetch from robots.txt from "{url}" with error: "{e}"')
 
-
+            robots = Protego.parse('User-agent: *\nAllow: /')
 
         return cls(url, robots, http_client=http_client, proxy_info=proxy_info)
 
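The new `load` logic falls back to a permissive rule set whenever the robots.txt fetch fails or returns a client error. A minimal standalone sketch of the same pattern, using only `urllib` and Protego rather than crawlee's `HttpClient` (so the fetch call here is illustrative, not crawlee's API):

from urllib.error import URLError
from urllib.request import urlopen

from protego import Protego

ALLOW_ALL = 'User-agent: *\nAllow: /'


def load_robots(url: str) -> Protego:
    """Fetch and parse robots.txt, falling back to an allow-all policy on any failure."""
    try:
        with urlopen(url, timeout=10) as response:
            # Client errors (4xx) raise HTTPError here, which the except clause also catches,
            # mirroring the permissive fallback in the diff above.
            body = response.read().decode('utf-8')
        return Protego.parse(body)
    except (URLError, UnicodeDecodeError) as exc:
        print(f'Failed to fetch robots.txt from "{url}": {exc}')
        return Protego.parse(ALLOW_ALL)


robots = load_robots('https://example.com/robots.txt')
print(robots.can_fetch('https://example.com/some/page', '*'))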
crawlee/_utils/sitemap.py
CHANGED
@@ -335,7 +335,7 @@ async def _fetch_and_process_sitemap(
             # Check if the first chunk is a valid gzip header
             if first_chunk and raw_chunk.startswith(b'\x1f\x8b'):
                 decompressor = zlib.decompressobj(zlib.MAX_WBITS | 16)
-
+            first_chunk = False
 
             chunk = decompressor.decompress(raw_chunk) if decompressor else raw_chunk
             text_chunk = decoder.decode(chunk)
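For context, the sitemap fetcher sniffs the gzip magic bytes (`\x1f\x8b`) on the first chunk and, if present, routes all chunks through a `zlib` decompressor configured for gzip framing. A small self-contained sketch of that chunked decompression pattern (the sitemap payload and chunking are made up for illustration, and the per-chunk decode is simplified compared to crawlee's incremental decoder):

import gzip
import zlib
from collections.abc import Iterable, Iterator


def iter_decoded_chunks(raw_chunks: Iterable[bytes]) -> Iterator[str]:
    """Yield decoded text chunks, transparently decompressing a gzip stream if detected."""
    decompressor = None
    first_chunk = True
    for raw_chunk in raw_chunks:
        # zlib.MAX_WBITS | 16 tells zlib to expect a gzip header and trailer.
        if first_chunk and raw_chunk.startswith(b'\x1f\x8b'):
            decompressor = zlib.decompressobj(zlib.MAX_WBITS | 16)
        first_chunk = False

        chunk = decompressor.decompress(raw_chunk) if decompressor else raw_chunk
        yield chunk.decode('utf-8')


payload = gzip.compress(b'<urlset><url><loc>https://example.com/</loc></url></urlset>')
chunks = [payload[:10], payload[10:]]  # Simulate a streamed response body.
print(''.join(iter_decoded_chunks(chunks)))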
crawlee/_utils/system.py
CHANGED
@@ -36,7 +36,7 @@ else:
 class CpuInfo(BaseModel):
     """Information about the CPU usage."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
     used_ratio: Annotated[float, Field(alias='usedRatio')]
     """The ratio of CPU currently in use, represented as a float between 0 and 1."""
@@ -51,7 +51,7 @@ class CpuInfo(BaseModel):
 class MemoryUsageInfo(BaseModel):
     """Information about the memory usage."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
     current_size: Annotated[
         ByteSize,
@@ -71,7 +71,7 @@ class MemoryUsageInfo(BaseModel):
 class MemoryInfo(MemoryUsageInfo):
     """Information about system memory."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
     total_size: Annotated[
         ByteSize, PlainValidator(ByteSize.validate), PlainSerializer(lambda size: size.bytes), Field(alias='totalSize')
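These `model_config` updates switch the models to pydantic's `validate_by_name`/`validate_by_alias` pair, which accepts input under either the field name or its camelCase alias. A minimal sketch of that behaviour (assuming pydantic ≥ 2.11, where these `ConfigDict` keys exist; the model is a cut-down stand-in, not the real one):

from typing import Annotated

from pydantic import BaseModel, ConfigDict, Field


class CpuInfo(BaseModel):
    """Cut-down stand-in for the real model, just to show the alias handling."""

    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    used_ratio: Annotated[float, Field(alias='usedRatio')]


# Both spellings validate, and serialization can still target the alias.
by_name = CpuInfo(used_ratio=0.42)
by_alias = CpuInfo.model_validate({'usedRatio': 0.42})
print(by_name == by_alias)                # True
print(by_name.model_dump(by_alias=True))  # {'usedRatio': 0.42}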
crawlee/_utils/urls.py
CHANGED
@@ -7,6 +7,7 @@ from yarl import URL
 
 if TYPE_CHECKING:
     from collections.abc import Iterator
+    from logging import Logger
 
 
 def is_url_absolute(url: str) -> bool:
@@ -22,13 +23,19 @@ def convert_to_absolute_url(base_url: str, relative_url: str) -> str:
     return str(URL(base_url).join(URL(relative_url)))
 
 
-def to_absolute_url_iterator(base_url: str, urls: Iterator[str]) -> Iterator[str]:
+def to_absolute_url_iterator(base_url: str, urls: Iterator[str], logger: Logger | None = None) -> Iterator[str]:
     """Convert an iterator of relative URLs to absolute URLs using a base URL."""
     for url in urls:
         if is_url_absolute(url):
            yield url
         else:
-
+            converted_url = convert_to_absolute_url(base_url, url)
+            # Skip the URL if conversion fails, probably due to an incorrect format, such as 'mailto:'.
+            if not is_url_absolute(converted_url):
+                if logger:
+                    logger.debug(f'Could not convert URL "{url}" to absolute using base URL "{base_url}". Skipping it.')
+                continue
+            yield converted_url
 
 
 _http_url_adapter = TypeAdapter(AnyHttpUrl)
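The new skip-and-log branch leans on yarl's RFC 3986 join semantics: joining a base URL with a reference that carries its own scheme (such as `mailto:`) returns the reference unchanged, which still has no host and therefore stays "non-absolute" and gets skipped. A quick standalone illustration (example URLs are made up):

from yarl import URL

base = URL('https://example.com/docs/')

for link in ['page.html', 'https://other.example/x', 'mailto:team@example.com']:
    joined = base.join(URL(link))
    # yarl treats a URL without a host as non-absolute, so mailto: links stay "relative" and are skipped.
    status = 'kept' if joined.is_absolute() else 'skipped'
    print(f'{link!r:40} -> {str(joined)!r:40} {status}')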
crawlee/browsers/_browser_pool.py
CHANGED
@@ -118,7 +118,10 @@ class BrowserPool:
         """Initialize a new instance with a single `PlaywrightBrowserPlugin` configured with the provided options.
 
         Args:
-            browser_type: The type of browser to launch
+            browser_type: The type of browser to launch:
+                - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+                - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+                    the system.
             user_data_dir: Path to a user data directory, which stores browser session data like cookies
                 and local storage.
             browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
from asyncio import Lock
|
|
5
6
|
from datetime import datetime, timedelta, timezone
|
|
6
7
|
from typing import TYPE_CHECKING, Any, cast
|
|
7
8
|
|
|
@@ -77,6 +78,19 @@ class PlaywrightBrowserController(BrowserController):
|
|
|
77
78
|
|
|
78
79
|
self._total_opened_pages = 0
|
|
79
80
|
|
|
81
|
+
self._context_creation_lock: Lock | None = None
|
|
82
|
+
|
|
83
|
+
async def _get_context_creation_lock(self) -> Lock:
|
|
84
|
+
"""Get context checking and creation lock.
|
|
85
|
+
|
|
86
|
+
It should be done with lock to prevent multiple concurrent attempts to create context, which could lead to
|
|
87
|
+
memory leak as one of the two concurrently created contexts will become orphaned and not properly closed.
|
|
88
|
+
"""
|
|
89
|
+
if self._context_creation_lock:
|
|
90
|
+
return self._context_creation_lock
|
|
91
|
+
self._context_creation_lock = Lock()
|
|
92
|
+
return self._context_creation_lock
|
|
93
|
+
|
|
80
94
|
@property
|
|
81
95
|
@override
|
|
82
96
|
def pages(self) -> list[Page]:
|
|
@@ -137,12 +151,6 @@ class PlaywrightBrowserController(BrowserController):
|
|
|
137
151
|
Raises:
|
|
138
152
|
ValueError: If the browser has reached the maximum number of open pages.
|
|
139
153
|
"""
|
|
140
|
-
if not self._browser_context:
|
|
141
|
-
self._browser_context = await self._create_browser_context(
|
|
142
|
-
browser_new_context_options=browser_new_context_options,
|
|
143
|
-
proxy_info=proxy_info,
|
|
144
|
-
)
|
|
145
|
-
|
|
146
154
|
if not self.has_free_capacity:
|
|
147
155
|
raise ValueError('Cannot open more pages in this browser.')
|
|
148
156
|
|
|
@@ -154,11 +162,12 @@ class PlaywrightBrowserController(BrowserController):
|
|
|
154
162
|
)
|
|
155
163
|
page = await new_context.new_page()
|
|
156
164
|
else:
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
165
|
+
async with await self._get_context_creation_lock():
|
|
166
|
+
if not self._browser_context:
|
|
167
|
+
self._browser_context = await self._create_browser_context(
|
|
168
|
+
browser_new_context_options=browser_new_context_options,
|
|
169
|
+
proxy_info=proxy_info,
|
|
170
|
+
)
|
|
162
171
|
page = await self._browser_context.new_page()
|
|
163
172
|
|
|
164
173
|
# Handle page close event
|
|
@@ -169,7 +178,6 @@ class PlaywrightBrowserController(BrowserController):
|
|
|
169
178
|
self._last_page_opened_at = datetime.now(timezone.utc)
|
|
170
179
|
|
|
171
180
|
self._total_opened_pages += 1
|
|
172
|
-
|
|
173
181
|
return page
|
|
174
182
|
|
|
175
183
|
@override
|
|
@@ -206,10 +214,9 @@ class PlaywrightBrowserController(BrowserController):
|
|
|
206
214
|
`self._fingerprint_generator` is available.
|
|
207
215
|
"""
|
|
208
216
|
browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {}
|
|
209
|
-
|
|
210
217
|
if proxy_info:
|
|
211
218
|
if browser_new_context_options.get('proxy'):
|
|
212
|
-
logger.warning("browser_new_context_options['proxy']
|
|
219
|
+
logger.warning("browser_new_context_options['proxy'] overridden by explicit `proxy_info` argument.")
|
|
213
220
|
|
|
214
221
|
browser_new_context_options['proxy'] = ProxySettings(
|
|
215
222
|
server=f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}',
|
|
@@ -244,5 +251,4 @@ class PlaywrightBrowserController(BrowserController):
|
|
|
244
251
|
browser_new_context_options['extra_http_headers'] = browser_new_context_options.get(
|
|
245
252
|
'extra_http_headers', extra_http_headers
|
|
246
253
|
)
|
|
247
|
-
|
|
248
254
|
return await self._browser.new_context(**browser_new_context_options)
|
|
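The controller now guards browser-context creation with a lazily created `asyncio.Lock`, so two coroutines racing through page creation cannot both build a context and leave one of them orphaned. A minimal sketch of the same lazy-lock, check-then-create pattern outside crawlee (the `ContextHolder` class and its sleep are stand-ins for the expensive context creation):

import asyncio


class ContextHolder:
    """Creates an expensive shared resource at most once, even under concurrency."""

    def __init__(self) -> None:
        self._context: object | None = None
        self._creation_lock: asyncio.Lock | None = None
        self.creations = 0

    def _get_creation_lock(self) -> asyncio.Lock:
        # Created lazily so the lock is only built once it is actually needed.
        if self._creation_lock is None:
            self._creation_lock = asyncio.Lock()
        return self._creation_lock

    async def get_context(self) -> object:
        async with self._get_creation_lock():
            # Re-check inside the lock: only the first waiter actually creates the context.
            if self._context is None:
                await asyncio.sleep(0.01)  # Stand-in for an expensive create_context() call.
                self.creations += 1
                self._context = object()
        return self._context


async def main() -> None:
    holder = ContextHolder()
    await asyncio.gather(*(holder.get_context() for _ in range(10)))
    print(f'contexts created: {holder.creations}')  # -> contexts created: 1


asyncio.run(main())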
crawlee/browsers/_playwright_browser_plugin.py
CHANGED
@@ -34,8 +34,8 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
 
     It is a plugin designed to manage browser instances using the Playwright automation library. It acts as a factory
     for creating new browser instances and provides a unified interface for interacting with different browser types
-    (chromium, firefox, and
-    executable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each
+    (chromium, firefox, webkit and chrome). This class integrates configuration options for browser launches (headless
+    mode, executable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each
     browser instance, ensuring that resource limits are respected.
     """
 
@@ -55,7 +55,10 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
         """Initialize a new instance.
 
         Args:
-            browser_type: The type of browser to launch
+            browser_type: The type of browser to launch:
+                - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+                - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+                    the system.
             user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local
                 storage.
             browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
@@ -80,6 +83,17 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
             'chromium_sandbox': not config.disable_browser_sandbox,
         }
 
+        if browser_type == 'chrome' and default_launch_browser_options['executable_path']:
+            raise ValueError(
+                'Cannot use browser_type `chrome` with `Configuration.default_browser_path` or `executable_path` set.'
+            )
+
+        # Map 'chrome' to 'chromium' with the 'chrome' channel.
+        if browser_type == 'chrome':
+            browser_type = 'chromium'
+            # Chromium parameter 'channel' set to 'chrome' enables using installed Google Chrome.
+            default_launch_browser_options['channel'] = 'chrome'
+
         self._browser_type: BrowserType = browser_type
         self._browser_launch_options: dict[str, Any] = default_launch_browser_options | (browser_launch_options or {})
         self._browser_new_context_options = browser_new_context_options or {}
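Under the hood the new `'chrome'` option is just Playwright's `channel='chrome'` launch argument on the Chromium driver. A quick standalone Playwright sketch of what that mapping means (requires Google Chrome installed locally; this bypasses crawlee entirely):

import asyncio

from playwright.async_api import async_playwright


async def main() -> None:
    async with async_playwright() as p:
        # channel='chrome' launches the locally installed Google Chrome
        # instead of the Playwright-managed Chromium build.
        browser = await p.chromium.launch(channel='chrome', headless=True)
        page = await browser.new_page()
        await page.goto('https://example.com')
        print(await page.title())
        await browser.close()


asyncio.run(main())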
crawlee/browsers/_types.py
CHANGED
crawlee/configuration.py
CHANGED
@@ -28,6 +28,8 @@ class Configuration(BaseSettings):
     Settings can also be configured via environment variables, prefixed with `CRAWLEE_`.
     """
 
+    # TODO: https://github.com/pydantic/pydantic-settings/issues/706
+    # Use `SettingsConfigDict(validate_by_name=True, validate_by_alias=True)` when issue is resolved.
     model_config = SettingsConfigDict(populate_by_name=True)
 
     internal_timeout: Annotated[timedelta | None, Field(alias='crawlee_internal_timeout')] = None
crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
CHANGED
@@ -34,7 +34,9 @@ TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=St
 
 @docs_group('Crawlers')
 class AbstractHttpCrawler(
-
+    BasicCrawler[TCrawlingContext, StatisticsState],
+    ABC,
+    Generic[TCrawlingContext, TParseResult, TSelectResult],
 ):
     """A web crawler for performing HTTP requests.
 
@@ -165,7 +167,9 @@ class AbstractHttpCrawler(
         kwargs.setdefault('strategy', 'same-hostname')
 
         links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
-        links_iterator = to_absolute_url_iterator(
+        links_iterator = to_absolute_url_iterator(
+            context.request.loaded_url or context.request.url, links_iterator, logger=context.log
+        )
 
         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
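The link-filtering line splits the URL stream into robots-skipped and robots-allowed halves with a `partition` helper. Its import is not shown in this hunk, but the call matches `more_itertools.partition`, which routes items failing the predicate into the first iterator and passing ones into the second. A small standalone sketch under that assumption, with an invented predicate in place of `robots_txt_file.is_allowed`:

from more_itertools import partition

allowed_prefixes = ('https://example.com/docs/',)

links = [
    'https://example.com/docs/intro',
    'https://example.com/admin/login',
    'https://example.com/docs/api',
]

# The first iterable receives items where the predicate is False, the second where it is True.
skipped, kept = partition(lambda url: url.startswith(allowed_prefixes), links)

print('kept   :', list(kept))
print('skipped:', list(skipped))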
crawlee/crawlers/_abstract_http/_abstract_http_parser.py
CHANGED
@@ -16,7 +16,7 @@ if TYPE_CHECKING:
 
 
 @docs_group('HTTP parsers')
-class AbstractHttpParser(Generic[TParseResult, TSelectResult]
+class AbstractHttpParser(ABC, Generic[TParseResult, TSelectResult]):
     """Parser used for parsing HTTP response and inspecting parsed result to find links or detect blocking."""
 
     @abstractmethod
crawlee/crawlers/_abstract_http/_http_crawling_context.py
CHANGED
@@ -31,7 +31,7 @@ class HttpCrawlingContext(BasicCrawlingContext, HttpCrawlingResult):
 
 @dataclass(frozen=True)
 @docs_group('Crawling contexts')
-class ParsedHttpCrawlingContext(Generic[TParseResult]
+class ParsedHttpCrawlingContext(HttpCrawlingContext, Generic[TParseResult]):
     """The crawling context used by `AbstractHttpCrawler`.
 
     It provides access to key objects as well as utility functions for handling crawling tasks.
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py
CHANGED
@@ -12,7 +12,7 @@ from bs4 import BeautifulSoup, Tag
 from parsel import Selector
 from typing_extensions import Self, TypeVar, override
 
-from crawlee._types import BasicCrawlingContext, JsonSerializable, RequestHandlerRunResult
+from crawlee._types import BasicCrawlingContext, ConcurrencySettings, JsonSerializable, RequestHandlerRunResult
 from crawlee._utils.docs import docs_group
 from crawlee._utils.wait import wait_for
 from crawlee.crawlers import (
@@ -71,7 +71,6 @@ class _NonPersistentStatistics(Statistics):
     async def __aenter__(self) -> Self:
         self._active = True
         await self._state.initialize()
-        self._after_initialize()
         return self
 
     async def __aexit__(
@@ -85,8 +84,8 @@ class _NonPersistentStatistics(Statistics):
 
 @docs_group('Crawlers')
 class AdaptivePlaywrightCrawler(
-    Generic[TStaticCrawlingContext, TStaticParseResult, TStaticSelectResult],
     BasicCrawler[AdaptivePlaywrightCrawlingContext, AdaptivePlaywrightCrawlerStatisticState],
+    Generic[TStaticCrawlingContext, TStaticParseResult, TStaticSelectResult],
 ):
     """An adaptive web crawler capable of using both static HTTP request based crawling and browser based crawling.
 
@@ -149,15 +148,15 @@ class AdaptivePlaywrightCrawler(
             non-default configuration.
             kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
         """
-        # Some sub crawler kwargs are internally modified. Prepare copies.
-        basic_crawler_kwargs_for_static_crawler = deepcopy(kwargs)
-        basic_crawler_kwargs_for_pw_crawler = deepcopy(kwargs)
-
         # Adaptive crawling related.
         self.rendering_type_predictor = rendering_type_predictor or DefaultRenderingTypePredictor()
         self.result_checker = result_checker or (lambda _: True)
         self.result_comparator = result_comparator or create_default_comparator(result_checker)
 
+        # Set default concurrency settings for browser crawlers if not provided
+        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
         super().__init__(statistics=statistics, **kwargs)
 
         # Sub crawlers related.
@@ -166,11 +165,11 @@ class AdaptivePlaywrightCrawler(
         # Each sub crawler will use custom logger .
         static_logger = getLogger('Subcrawler_static')
         static_logger.setLevel(logging.ERROR)
-        basic_crawler_kwargs_for_static_crawler
+        basic_crawler_kwargs_for_static_crawler: _BasicCrawlerOptions = {'_logger': static_logger, **kwargs}
 
         pw_logger = getLogger('Subcrawler_playwright')
         pw_logger.setLevel(logging.ERROR)
-        basic_crawler_kwargs_for_pw_crawler
+        basic_crawler_kwargs_for_pw_crawler: _BasicCrawlerOptions = {'_logger': pw_logger, **kwargs}
 
         # Initialize sub crawlers to create their pipelines.
         static_crawler_class = AbstractHttpCrawler.create_parsed_http_crawler_class(static_parser=static_parser)
@@ -315,7 +314,7 @@ class AdaptivePlaywrightCrawler(
                 ),
                 logger=self._logger,
             )
-            return SubCrawlerRun(result=result)
+            return SubCrawlerRun(result=result, run_context=context_linked_to_result)
         except Exception as e:
             return SubCrawlerRun(exception=e)
 
@@ -371,7 +370,8 @@ class AdaptivePlaywrightCrawler(
             self.track_http_only_request_handler_runs()
 
             static_run = await self._crawl_one(rendering_type='static', context=context)
-            if static_run.result and self.result_checker(static_run.result):
+            if static_run.result and static_run.run_context and self.result_checker(static_run.result):
+                self._update_context_from_copy(context, static_run.run_context)
                 self._context_result_map[context] = static_run.result
                 return
             if static_run.exception:
@@ -402,13 +402,10 @@ class AdaptivePlaywrightCrawler(
         if pw_run.exception is not None:
             raise pw_run.exception
 
-        if pw_run.result:
-            self._context_result_map[context] = pw_run.result
-
+        if pw_run.result and pw_run.run_context:
             if should_detect_rendering_type:
                 detection_result: RenderingType
                 static_run = await self._crawl_one('static', context=context, state=old_state_copy)
-
                 if static_run.result and self.result_comparator(static_run.result, pw_run.result):
                     detection_result = 'static'
                 else:
@@ -417,6 +414,9 @@ class AdaptivePlaywrightCrawler(
                 context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}')
                 self.rendering_type_predictor.store_result(context.request, detection_result)
 
+            self._update_context_from_copy(context, pw_run.run_context)
+            self._context_result_map[context] = pw_run.result
+
     def pre_navigation_hook(
         self,
         hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]] | None = None,
@@ -451,8 +451,32 @@ class AdaptivePlaywrightCrawler(
     def track_rendering_type_mispredictions(self) -> None:
         self.statistics.state.rendering_type_mispredictions += 1
 
+    def _update_context_from_copy(self, context: BasicCrawlingContext, context_copy: BasicCrawlingContext) -> None:
+        """Update mutable fields of `context` from `context_copy`.
+
+        Uses object.__setattr__ to bypass frozen dataclass restrictions,
+        allowing state synchronization after isolated crawler execution.
+        """
+        updating_attributes = {
+            'request': ('headers', 'user_data'),
+            'session': ('_user_data', '_usage_count', '_error_score', '_cookies'),
+        }
+
+        for attr, sub_attrs in updating_attributes.items():
+            original_sub_obj = getattr(context, attr)
+            copy_sub_obj = getattr(context_copy, attr)
+
+            # Check that both sub objects are not None
+            if original_sub_obj is None or copy_sub_obj is None:
+                continue
+
+            for sub_attr in sub_attrs:
+                new_value = getattr(copy_sub_obj, sub_attr)
+                object.__setattr__(original_sub_obj, sub_attr, new_value)
+
 
 @dataclass(frozen=True)
 class SubCrawlerRun:
     result: RequestHandlerRunResult | None = None
     exception: Exception | None = None
+    run_context: BasicCrawlingContext | None = None
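The new `_update_context_from_copy` helper writes back into frozen dataclass instances via `object.__setattr__`, which skips the generated `__setattr__` that raises `FrozenInstanceError`. A tiny standalone illustration of that mechanism (the dataclass here is invented for the example):

from dataclasses import FrozenInstanceError, dataclass, field


@dataclass(frozen=True)
class RequestState:
    url: str
    headers: dict[str, str] = field(default_factory=dict)


original = RequestState(url='https://example.com')
updated_copy = RequestState(url='https://example.com', headers={'x-run': 'playwright'})

try:
    original.headers = updated_copy.headers  # type: ignore[misc]
except FrozenInstanceError:
    print('normal assignment is blocked on a frozen dataclass')

# object.__setattr__ bypasses the frozen guard, mirroring the state sync in the diff above.
object.__setattr__(original, 'headers', updated_copy.headers)
print(original.headers)  # {'x-run': 'playwright'}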
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py
CHANGED
@@ -12,7 +12,7 @@ from crawlee.statistics import StatisticsState
 class AdaptivePlaywrightCrawlerStatisticState(StatisticsState):
     """Statistic data about a crawler run with additional information related to adaptive crawling."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, ser_json_inf_nan='constants')
 
     http_only_request_handler_runs: Annotated[int, Field(alias='http_only_request_handler_runs')] = 0
     """Number representing how many times static http based crawling was used."""
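Besides the alias flags, this config adds `ser_json_inf_nan='constants'`, a pydantic v2 option that emits `Infinity`/`NaN` tokens in JSON output instead of the default `null`. A minimal sketch of the difference (the models are invented for the example):

import math

from pydantic import BaseModel, ConfigDict


class WithConstants(BaseModel):
    model_config = ConfigDict(ser_json_inf_nan='constants')
    requests_per_minute: float


class WithDefault(BaseModel):
    requests_per_minute: float


print(WithConstants(requests_per_minute=math.inf).model_dump_json())  # {"requests_per_minute":Infinity}
print(WithDefault(requests_per_minute=math.inf).model_dump_json())    # {"requests_per_minute":null}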
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py
CHANGED
@@ -31,7 +31,8 @@ class AdaptiveContextError(RuntimeError):
 
 @dataclass(frozen=True)
 @docs_group('Crawling contexts')
 class AdaptivePlaywrightCrawlingContext(
-
+    ParsedHttpCrawlingContext[TStaticParseResult],
+    Generic[TStaticParseResult, TStaticSelectResult],
 ):
     _static_parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult]
     """The crawling context used by `AdaptivePlaywrightCrawler`.
crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py
CHANGED
@@ -32,7 +32,7 @@ FeatureVector = tuple[float, float]
 
 
 class RenderingTypePredictorState(BaseModel):
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
     model: Annotated[
         LogisticRegression,