crawlee 0.6.13b43__py3-none-any.whl → 1.1.2b4__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
Potentially problematic release.
- crawlee/_request.py +32 -21
- crawlee/_service_locator.py +4 -4
- crawlee/_types.py +87 -25
- crawlee/_utils/file.py +7 -0
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +15 -0
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +1 -1
- crawlee/_utils/time.py +41 -1
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +4 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +3 -1
- crawlee/crawlers/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +47 -11
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +38 -14
- crawlee/crawlers/_basic/_basic_crawler.py +139 -96
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_playwright/_playwright_crawler.py +52 -10
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/events/_event_manager.py +3 -1
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +12 -0
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/otel/crawler_instrumentor.py +3 -3
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_sitemap_request_loader.py +22 -4
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +32 -1
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_storage_client.py +5 -4
- crawlee/storage_clients/_file_system/_dataset_client.py +6 -7
- crawlee/storage_clients/_file_system/_key_value_store_client.py +7 -8
- crawlee/storage_clients/_file_system/_request_queue_client.py +31 -15
- crawlee/storage_clients/_file_system/_storage_client.py +2 -2
- crawlee/storage_clients/_memory/_dataset_client.py +4 -5
- crawlee/storage_clients/_memory/_key_value_store_client.py +4 -5
- crawlee/storage_clients/_memory/_request_queue_client.py +4 -5
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +295 -0
- crawlee/storage_clients/_redis/_dataset_client.py +325 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
- crawlee/storage_clients/_redis/_storage_client.py +146 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +282 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +10 -10
- crawlee/storages/_base.py +3 -1
- crawlee/storages/_dataset.py +5 -3
- crawlee/storages/_key_value_store.py +11 -6
- crawlee/storages/_request_queue.py +5 -3
- crawlee/storages/_storage_instance_manager.py +54 -68
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/METADATA +17 -5
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/RECORD +80 -58
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/WHEEL +1 -1
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/licenses/LICENSE +0 -0
crawlee/browsers/_playwright_browser_controller.py CHANGED

@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+from asyncio import Lock
 from datetime import datetime, timedelta, timezone
 from typing import TYPE_CHECKING, Any, cast
 

@@ -77,6 +78,19 @@ class PlaywrightBrowserController(BrowserController):
 
         self._total_opened_pages = 0
 
+        self._context_creation_lock: Lock | None = None
+
+    async def _get_context_creation_lock(self) -> Lock:
+        """Get context checking and creation lock.
+
+        It should be done with lock to prevent multiple concurrent attempts to create context, which could lead to
+        memory leak as one of the two concurrently created contexts will become orphaned and not properly closed.
+        """
+        if self._context_creation_lock:
+            return self._context_creation_lock
+        self._context_creation_lock = Lock()
+        return self._context_creation_lock
+
     @property
     @override
     def pages(self) -> list[Page]:
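Note: the lock above is created lazily rather than in `__init__`, so it is only instantiated once a coroutine is already running on the event loop, and context creation is then serialized through it. A minimal standalone sketch of the same pattern — the class and method names here are illustrative, not crawlee's API:

import asyncio


class LazyResource:
    """Create a shared async resource at most once, even under concurrent access."""

    def __init__(self) -> None:
        self._resource: object | None = None
        self._lock: asyncio.Lock | None = None  # created lazily, on the running loop

    def _get_lock(self) -> asyncio.Lock:
        # No await between the check and the assignment, so this is safe
        # within a single-threaded event loop.
        if self._lock is None:
            self._lock = asyncio.Lock()
        return self._lock

    async def get(self) -> object:
        async with self._get_lock():
            # Double-check under the lock: only the first caller creates the
            # resource; later callers reuse it instead of leaking an orphaned duplicate.
            if self._resource is None:
                self._resource = await self._create()
        return self._resource

    async def _create(self) -> object:
        await asyncio.sleep(0)  # stand-in for expensive async setup (e.g. a browser context)
        return object()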
@@ -137,12 +151,6 @@
         Raises:
             ValueError: If the browser has reached the maximum number of open pages.
         """
-        if not self._browser_context:
-            self._browser_context = await self._create_browser_context(
-                browser_new_context_options=browser_new_context_options,
-                proxy_info=proxy_info,
-            )
-
         if not self.has_free_capacity:
             raise ValueError('Cannot open more pages in this browser.')
 

@@ -154,11 +162,12 @@
             )
             page = await new_context.new_page()
         else:
-
-
-
-
-
+            async with await self._get_context_creation_lock():
+                if not self._browser_context:
+                    self._browser_context = await self._create_browser_context(
+                        browser_new_context_options=browser_new_context_options,
+                        proxy_info=proxy_info,
+                    )
             page = await self._browser_context.new_page()
 
         # Handle page close event

@@ -169,7 +178,6 @@
         self._last_page_opened_at = datetime.now(timezone.utc)
 
         self._total_opened_pages += 1
-
         return page
 
     @override
@@ -206,10 +214,9 @@
         `self._fingerprint_generator` is available.
         """
         browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {}
-
         if proxy_info:
             if browser_new_context_options.get('proxy'):
-                logger.warning("browser_new_context_options['proxy']
+                logger.warning("browser_new_context_options['proxy'] overridden by explicit `proxy_info` argument.")
 
             browser_new_context_options['proxy'] = ProxySettings(
                 server=f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}',

@@ -244,5 +251,4 @@
         browser_new_context_options['extra_http_headers'] = browser_new_context_options.get(
             'extra_http_headers', extra_http_headers
         )
-
         return await self._browser.new_context(**browser_new_context_options)
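Note: the change above establishes a precedence rule — an explicitly passed `proxy_info` wins over a proxy already present in the context options, with a warning on the override. A hedged sketch of that rule using plain dicts (the helper name is hypothetical):

import logging
from typing import Any

logger = logging.getLogger(__name__)


def merge_context_options(
    context_options: dict[str, Any] | None,
    explicit_proxy: dict[str, Any] | None,
) -> dict[str, Any]:
    """Copy the options and let an explicitly passed proxy take precedence."""
    options = dict(context_options) if context_options else {}
    if explicit_proxy:
        if options.get('proxy'):
            logger.warning("Pre-set 'proxy' option overridden by the explicitly passed proxy.")
        # Playwright accepts the proxy as a mapping, e.g. {'server': 'http://host:8080'}.
        options['proxy'] = explicit_proxy
    return options


print(merge_context_options({'proxy': {'server': 'http://old:1'}}, {'server': 'http://new:2'}))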
crawlee/browsers/_playwright_browser_plugin.py CHANGED

@@ -34,8 +34,8 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
 
     It is a plugin designed to manage browser instances using the Playwright automation library. It acts as a factory
     for creating new browser instances and provides a unified interface for interacting with different browser types
-    (chromium, firefox, and
-    executable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each
+    (chromium, firefox, webkit and chrome). This class integrates configuration options for browser launches (headless
+    mode, executable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each
     browser instance, ensuring that resource limits are respected.
     """
 

@@ -55,7 +55,10 @@
         """Initialize a new instance.
 
         Args:
-            browser_type: The type of browser to launch
+            browser_type: The type of browser to launch:
+                - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+                - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+                  the system.
             user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local
                 storage.
             browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided

@@ -80,6 +83,17 @@
             'chromium_sandbox': not config.disable_browser_sandbox,
         }
 
+        if browser_type == 'chrome' and default_launch_browser_options['executable_path']:
+            raise ValueError(
+                'Cannot use browser_type `chrome` with `Configuration.default_browser_path` or `executable_path` set.'
+            )
+
+        # Map 'chrome' to 'chromium' with the 'chrome' channel.
+        if browser_type == 'chrome':
+            browser_type = 'chromium'
+            # Chromium parameter 'channel' set to 'chrome' enables using installed Google Chrome.
+            default_launch_browser_options['channel'] = 'chrome'
+
         self._browser_type: BrowserType = browser_type
         self._browser_launch_options: dict[str, Any] = default_launch_browser_options | (browser_launch_options or {})
         self._browser_new_context_options = browser_new_context_options or {}
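Note: the new `chrome` browser type maps onto Playwright's standard mechanism for driving a locally installed Google Chrome — launching the `chromium` browser type with `channel='chrome'`. A minimal sketch using Playwright directly:

import asyncio

from playwright.async_api import async_playwright


async def main() -> None:
    async with async_playwright() as pw:
        # channel='chrome' tells Playwright to use the system-installed Google Chrome
        # instead of its bundled Chromium build; Chrome must be installed locally.
        browser = await pw.chromium.launch(channel='chrome', headless=True)
        page = await browser.new_page()
        await page.goto('https://example.com')
        print(await page.title())
        await browser.close()


asyncio.run(main())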
crawlee/browsers/_types.py CHANGED
crawlee/configuration.py CHANGED
@@ -28,7 +28,9 @@ class Configuration(BaseSettings):
     Settings can also be configured via environment variables, prefixed with `CRAWLEE_`.
     """
 
-
+    # TODO: https://github.com/pydantic/pydantic-settings/issues/706
+    # Use `SettingsConfigDict(validate_by_name=True, validate_by_alias=True)` when issue is resolved.
+    model_config = SettingsConfigDict(populate_by_name=True)
 
     internal_timeout: Annotated[timedelta | None, Field(alias='crawlee_internal_timeout')] = None
     """Timeout for the internal asynchronous operations."""
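Note: with `populate_by_name=True`, pydantic-settings fields that declare an env-style alias (such as `crawlee_internal_timeout`) can also be set in code by their attribute name. A small sketch of the idea — the settings class and field are illustrative:

from datetime import timedelta
from typing import Annotated

from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict


class MySettings(BaseSettings):
    model_config = SettingsConfigDict(populate_by_name=True)

    # The alias drives env-var lookup (CRAWLEE_INTERNAL_TIMEOUT, case-insensitive),
    # while populate_by_name also allows `MySettings(internal_timeout=...)` in code.
    internal_timeout: Annotated[timedelta | None, Field(alias='crawlee_internal_timeout')] = None


print(MySettings(internal_timeout=timedelta(seconds=30)).internal_timeout)
print(MySettings(crawlee_internal_timeout=timedelta(seconds=5)).internal_timeout)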
crawlee/crawlers/__init__.py CHANGED

@@ -1,7 +1,7 @@
 from crawlee._utils.try_import import install_import_hook as _install_import_hook
 from crawlee._utils.try_import import try_import as _try_import
 
-from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, ParsedHttpCrawlingContext
+from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, HttpCrawlerOptions, ParsedHttpCrawlingContext
 from ._basic import BasicCrawler, BasicCrawlerOptions, BasicCrawlingContext, ContextPipeline
 from ._http import HttpCrawler, HttpCrawlingContext, HttpCrawlingResult
 

@@ -51,6 +51,7 @@ __all__ = [
     'BeautifulSoupParserType',
     'ContextPipeline',
     'HttpCrawler',
+    'HttpCrawlerOptions',
     'HttpCrawlingContext',
     'HttpCrawlingResult',
     'ParsedHttpCrawlingContext',

crawlee/crawlers/_abstract_http/__init__.py CHANGED

@@ -1,9 +1,10 @@
-from ._abstract_http_crawler import AbstractHttpCrawler
+from ._abstract_http_crawler import AbstractHttpCrawler, HttpCrawlerOptions
 from ._abstract_http_parser import AbstractHttpParser
 from ._http_crawling_context import ParsedHttpCrawlingContext
 
 __all__ = [
     'AbstractHttpCrawler',
     'AbstractHttpParser',
+    'HttpCrawlerOptions',
     'ParsedHttpCrawlingContext',
 ]
crawlee/crawlers/_abstract_http/_abstract_http_crawler.py CHANGED

@@ -3,14 +3,16 @@ from __future__ import annotations
 import asyncio
 import logging
 from abc import ABC
+from datetime import timedelta
 from typing import TYPE_CHECKING, Any, Generic
 
 from more_itertools import partition
 from pydantic import ValidationError
-from typing_extensions import TypeVar
+from typing_extensions import NotRequired, TypeVar
 
 from crawlee._request import Request, RequestOptions
 from crawlee._utils.docs import docs_group
+from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
 from crawlee.errors import SessionError

@@ -32,6 +34,19 @@ TCrawlingContext = TypeVar('TCrawlingContext', bound=ParsedHttpCrawlingContext)
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 
 
+class HttpCrawlerOptions(
+    BasicCrawlerOptions[TCrawlingContext, TStatisticsState],
+    Generic[TCrawlingContext, TStatisticsState],
+):
+    """Arguments for the `AbstractHttpCrawler` constructor.
+
+    It is intended for typing forwarded `__init__` arguments in the subclasses.
+    """
+
+    navigation_timeout: NotRequired[timedelta | None]
+    """Timeout for the HTTP request."""
+
+
 @docs_group('Crawlers')
 class AbstractHttpCrawler(
     BasicCrawler[TCrawlingContext, StatisticsState],
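Note: `HttpCrawlerOptions` extends the `BasicCrawlerOptions` TypedDict so that subclasses can type-check the `**kwargs` they forward upstream. A generic sketch of that pattern — the class and field names below are illustrative, not crawlee's:

from datetime import timedelta

from typing_extensions import NotRequired, TypedDict, Unpack


class BaseOptions(TypedDict):
    max_retries: NotRequired[int]


class HttpOptions(BaseOptions):
    # NotRequired: callers may omit the key entirely.
    navigation_timeout: NotRequired[timedelta | None]


class HttpThing:
    def __init__(self, *, navigation_timeout: timedelta | None = None, **kwargs: Unpack[BaseOptions]) -> None:
        # Consume the option added at this level, forward the rest upstream.
        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
        self._max_retries = kwargs.get('max_retries', 3)


class JsonCrawler(HttpThing):
    def __init__(self, **kwargs: Unpack[HttpOptions]) -> None:
        # The combined TypedDict lets type checkers validate everything forwarded here.
        super().__init__(**kwargs)


JsonCrawler(navigation_timeout=timedelta(seconds=30), max_retries=5)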
@@ -56,10 +71,13 @@ class AbstractHttpCrawler(
         self,
         *,
         parser: AbstractHttpParser[TParseResult, TSelectResult],
+        navigation_timeout: timedelta | None = None,
         **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]],
     ) -> None:
         self._parser = parser
+        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
         self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
+        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}
 
         if '_context_pipeline' not in kwargs:
             raise ValueError(

@@ -112,9 +130,17 @@
     async def _execute_pre_navigation_hooks(
         self, context: BasicCrawlingContext
     ) -> AsyncGenerator[BasicCrawlingContext, None]:
-
-
-
+        context_id = id(context)
+        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
+
+        try:
+            for hook in self._pre_navigation_hooks:
+                async with self._shared_navigation_timeouts[context_id]:
+                    await hook(context)
+
+            yield context
+        finally:
+            self._shared_navigation_timeouts.pop(context_id, None)
 
     async def _parse_http_response(
         self, context: HttpCrawlingContext
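Note: `SharedTimeout` (from `crawlee._utils.time`) lets the pre-navigation hooks and the subsequent HTTP request draw from a single navigation-timeout budget per context. Its actual implementation is not shown in this diff; the following is only a rough approximation of the interface observed here (`async with` consumes time from the budget and exposes the remainder on entry):

import asyncio
from datetime import timedelta
from time import monotonic
from types import TracebackType


class SharedTimeout:
    """A time budget shared by several sequential async operations (approximation)."""

    def __init__(self, timeout: timedelta) -> None:
        self._remaining = timeout.total_seconds()
        self._entered_at: float | None = None

    async def __aenter__(self) -> timedelta:
        if self._remaining <= 0:
            raise TimeoutError('Shared timeout budget is exhausted.')
        self._entered_at = monotonic()
        return timedelta(seconds=self._remaining)

    async def __aexit__(
        self, exc_type: type[BaseException] | None, exc: BaseException | None, tb: TracebackType | None
    ) -> None:
        assert self._entered_at is not None
        # Whatever time this block consumed is subtracted from the shared budget.
        self._remaining -= monotonic() - self._entered_at
        self._entered_at = None


async def main() -> None:
    budget = SharedTimeout(timedelta(seconds=2))
    async with budget as remaining:
        print(f'budget entering hook: {remaining.total_seconds():.2f}s')
        await asyncio.sleep(0.5)  # e.g. a pre-navigation hook
    async with budget as remaining:
        print(f'remaining for the HTTP request: {remaining.total_seconds():.2f}s')


asyncio.run(main())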
@@ -167,7 +193,15 @@
         kwargs.setdefault('strategy', 'same-hostname')
 
         links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
-
+
+        # Get base URL from <base> tag if present
+        extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]'))
+        base_url: str = (
+            str(extracted_base_urls[0])
+            if extracted_base_urls
+            else context.request.loaded_url or context.request.url
+        )
+        links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)
 
         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
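Note: honoring `<base href>` matches standard browser behavior — relative links resolve against the base URL rather than the document URL. A quick illustration with the standard library:

from urllib.parse import urljoin

page_url = 'https://example.com/articles/page.html'
base_href = 'https://cdn.example.com/assets/'

# Without a <base> tag, relative links resolve against the page URL...
print(urljoin(page_url, 'style.css'))   # https://example.com/articles/style.css

# ...but with <base href="..."> they resolve against the base URL instead.
print(urljoin(base_href, 'style.css'))  # https://cdn.example.com/assets/style.css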
@@ -214,12 +248,14 @@
         Yields:
             The original crawling context enhanced by HTTP response.
         """
-
-
-
-
-
-
+        async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+            result = await self._http_client.crawl(
+                request=context.request,
+                session=context.session,
+                proxy_info=context.proxy_info,
+                statistics=self._statistics,
+                timeout=remaining_timeout,
+            )
 
         yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response)
 
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py CHANGED

@@ -12,7 +12,7 @@ from bs4 import BeautifulSoup, Tag
 from parsel import Selector
 from typing_extensions import Self, TypeVar, override
 
-from crawlee._types import BasicCrawlingContext, JsonSerializable, RequestHandlerRunResult
+from crawlee._types import BasicCrawlingContext, ConcurrencySettings, JsonSerializable, RequestHandlerRunResult
 from crawlee._utils.docs import docs_group
 from crawlee._utils.wait import wait_for
 from crawlee.crawlers import (

@@ -71,7 +71,6 @@ class _NonPersistentStatistics(Statistics):
     async def __aenter__(self) -> Self:
         self._active = True
         await self._state.initialize()
-        self._after_initialize()
         return self
 
     async def __aexit__(
@@ -149,15 +148,15 @@ class AdaptivePlaywrightCrawler(
             non-default configuration.
             kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
         """
-        # Some sub crawler kwargs are internally modified. Prepare copies.
-        basic_crawler_kwargs_for_static_crawler = deepcopy(kwargs)
-        basic_crawler_kwargs_for_pw_crawler = deepcopy(kwargs)
-
         # Adaptive crawling related.
         self.rendering_type_predictor = rendering_type_predictor or DefaultRenderingTypePredictor()
         self.result_checker = result_checker or (lambda _: True)
         self.result_comparator = result_comparator or create_default_comparator(result_checker)
 
+        # Set default concurrency settings for browser crawlers if not provided
+        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
         super().__init__(statistics=statistics, **kwargs)
 
         # Sub crawlers related.

@@ -166,11 +165,11 @@
         # Each sub crawler will use custom logger .
         static_logger = getLogger('Subcrawler_static')
         static_logger.setLevel(logging.ERROR)
-        basic_crawler_kwargs_for_static_crawler
+        basic_crawler_kwargs_for_static_crawler: _BasicCrawlerOptions = {'_logger': static_logger, **kwargs}
 
         pw_logger = getLogger('Subcrawler_playwright')
         pw_logger.setLevel(logging.ERROR)
-        basic_crawler_kwargs_for_pw_crawler
+        basic_crawler_kwargs_for_pw_crawler: _BasicCrawlerOptions = {'_logger': pw_logger, **kwargs}
 
         # Initialize sub crawlers to create their pipelines.
         static_crawler_class = AbstractHttpCrawler.create_parsed_http_crawler_class(static_parser=static_parser)
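Note: instead of deep-copying the shared kwargs, each sub-crawler's kwargs are now built by dict merging, after applying a shared default. A condensed sketch of the two steps — the function and keys are illustrative stand-ins:

import logging
from typing import Any


def build_subcrawler_kwargs(kwargs: dict[str, Any]) -> tuple[dict[str, Any], dict[str, Any]]:
    # Apply a shared default only when the caller did not provide one.
    if kwargs.get('concurrency_settings') is None:
        kwargs['concurrency_settings'] = {'desired_concurrency': 1}  # stand-in for ConcurrencySettings

    # Each sub-crawler gets the same kwargs plus its own logger; the merge
    # creates new dicts, so neither sub-crawler mutates the other's mapping.
    static_kwargs = {'_logger': logging.getLogger('Subcrawler_static'), **kwargs}
    pw_kwargs = {'_logger': logging.getLogger('Subcrawler_playwright'), **kwargs}
    return static_kwargs, pw_kwargs


static_kwargs, pw_kwargs = build_subcrawler_kwargs({'max_requests': 10})
print(static_kwargs['_logger'].name, pw_kwargs['_logger'].name)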
@@ -315,7 +314,7 @@
                 ),
                 logger=self._logger,
             )
-            return SubCrawlerRun(result=result)
+            return SubCrawlerRun(result=result, run_context=context_linked_to_result)
         except Exception as e:
             return SubCrawlerRun(exception=e)
 

@@ -371,7 +370,8 @@
             self.track_http_only_request_handler_runs()
 
         static_run = await self._crawl_one(rendering_type='static', context=context)
-        if static_run.result and self.result_checker(static_run.result):
+        if static_run.result and static_run.run_context and self.result_checker(static_run.result):
+            self._update_context_from_copy(context, static_run.run_context)
             self._context_result_map[context] = static_run.result
             return
         if static_run.exception:

@@ -402,13 +402,10 @@
         if pw_run.exception is not None:
             raise pw_run.exception
 
-        if pw_run.result:
-            self._context_result_map[context] = pw_run.result
-
+        if pw_run.result and pw_run.run_context:
             if should_detect_rendering_type:
                 detection_result: RenderingType
                 static_run = await self._crawl_one('static', context=context, state=old_state_copy)
-
                 if static_run.result and self.result_comparator(static_run.result, pw_run.result):
                     detection_result = 'static'
                 else:

@@ -417,6 +414,9 @@
                 context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}')
                 self.rendering_type_predictor.store_result(context.request, detection_result)
 
+            self._update_context_from_copy(context, pw_run.run_context)
+            self._context_result_map[context] = pw_run.result
+
     def pre_navigation_hook(
         self,
         hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]] | None = None,
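Note: the `_crawl_one` changes follow a result-object pattern — each sub-crawler run returns either a result (now paired with the context it ran against) or the captured exception, and the caller decides later whether to commit, retry, or re-raise. A stripped-down sketch of that pattern:

from dataclasses import dataclass


@dataclass(frozen=True)
class RunOutcome:
    result: str | None = None
    exception: Exception | None = None


def run_once(should_fail: bool) -> RunOutcome:
    try:
        if should_fail:
            raise RuntimeError('sub-crawler failed')
        return RunOutcome(result='parsed page')
    except Exception as e:  # deliberately captured; the caller decides what to do
        return RunOutcome(exception=e)


outcome = run_once(should_fail=False)
if outcome.result is not None:
    print(outcome.result)    # commit the result
elif outcome.exception is not None:
    raise outcome.exception  # or retry / fall back to the other sub-crawler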
@@ -451,8 +451,32 @@
     def track_rendering_type_mispredictions(self) -> None:
         self.statistics.state.rendering_type_mispredictions += 1
 
+    def _update_context_from_copy(self, context: BasicCrawlingContext, context_copy: BasicCrawlingContext) -> None:
+        """Update mutable fields of `context` from `context_copy`.
+
+        Uses object.__setattr__ to bypass frozen dataclass restrictions,
+        allowing state synchronization after isolated crawler execution.
+        """
+        updating_attributes = {
+            'request': ('headers', 'user_data'),
+            'session': ('_user_data', '_usage_count', '_error_score', '_cookies'),
+        }
+
+        for attr, sub_attrs in updating_attributes.items():
+            original_sub_obj = getattr(context, attr)
+            copy_sub_obj = getattr(context_copy, attr)
+
+            # Check that both sub objects are not None
+            if original_sub_obj is None or copy_sub_obj is None:
+                continue
+
+            for sub_attr in sub_attrs:
+                new_value = getattr(copy_sub_obj, sub_attr)
+                object.__setattr__(original_sub_obj, sub_attr, new_value)
+
 
 @dataclass(frozen=True)
 class SubCrawlerRun:
     result: RequestHandlerRunResult | None = None
     exception: Exception | None = None
+    run_context: BasicCrawlingContext | None = None
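Note: `_update_context_from_copy` relies on `object.__setattr__` bypassing the `__setattr__` guard that `@dataclass(frozen=True)` installs. A self-contained demonstration of that mechanism:

from dataclasses import FrozenInstanceError, dataclass, field


@dataclass(frozen=True)
class Context:
    url: str
    headers: dict[str, str] = field(default_factory=dict)


ctx = Context(url='https://example.com')

try:
    ctx.headers = {'accept': 'text/html'}  # normal assignment is blocked
except FrozenInstanceError:
    print('frozen dataclass rejects plain attribute assignment')

# object.__setattr__ goes around the frozen guard, which is how the adaptive
# crawler syncs mutable sub-objects back onto the original context.
object.__setattr__(ctx, 'headers', {'accept': 'text/html'})
print(ctx.headers)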