scrapling 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +29 -19
- scrapling/cli.py +21 -4
- scrapling/core/_types.py +3 -2
- scrapling/core/ai.py +24 -15
- scrapling/core/custom_types.py +20 -27
- scrapling/core/mixins.py +15 -9
- scrapling/core/shell.py +6 -4
- scrapling/core/storage.py +7 -6
- scrapling/core/translator.py +13 -8
- scrapling/core/utils/__init__.py +0 -1
- scrapling/engines/_browsers/__init__.py +0 -2
- scrapling/engines/_browsers/_base.py +45 -21
- scrapling/engines/_browsers/_camoufox.py +98 -43
- scrapling/engines/_browsers/_config_tools.py +1 -1
- scrapling/engines/_browsers/_controllers.py +34 -13
- scrapling/engines/_browsers/_validators.py +31 -10
- scrapling/engines/constants.py +0 -15
- scrapling/engines/static.py +749 -336
- scrapling/engines/toolbelt/convertor.py +13 -15
- scrapling/engines/toolbelt/custom.py +6 -9
- scrapling/engines/toolbelt/fingerprints.py +17 -10
- scrapling/engines/toolbelt/navigation.py +11 -3
- scrapling/fetchers/__init__.py +46 -0
- scrapling/fetchers/chrome.py +210 -0
- scrapling/fetchers/firefox.py +212 -0
- scrapling/fetchers/requests.py +28 -0
- scrapling/parser.py +109 -84
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/METADATA +17 -16
- scrapling-0.3.7.dist-info/RECORD +47 -0
- scrapling/fetchers.py +0 -444
- scrapling-0.3.5.dist-info/RECORD +0 -44
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/WHEEL +0 -0
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/top_level.txt +0 -0
@@ -7,22 +7,16 @@ from playwright.async_api import (
|
|
7
7
|
BrowserContext as AsyncBrowserContext,
|
8
8
|
Playwright as AsyncPlaywright,
|
9
9
|
)
|
10
|
-
from camoufox.
|
11
|
-
|
12
|
-
installed_verstr as camoufox_version,
|
13
|
-
)
|
10
|
+
from camoufox.pkgman import installed_verstr as camoufox_version
|
11
|
+
from camoufox.utils import launch_options as generate_launch_options
|
14
12
|
|
15
|
-
from scrapling.engines.toolbelt.navigation import intercept_route, async_intercept_route
|
16
|
-
from scrapling.core._types import (
|
17
|
-
Any,
|
18
|
-
Dict,
|
19
|
-
Optional,
|
20
|
-
)
|
21
13
|
from ._page import PageInfo, PagePool
|
22
|
-
from .
|
23
|
-
from .
|
14
|
+
from scrapling.parser import Selector
|
15
|
+
from scrapling.core._types import Any, cast, Dict, Optional, TYPE_CHECKING
|
24
16
|
from scrapling.engines.toolbelt.fingerprints import get_os_name
|
25
17
|
from ._validators import validate, PlaywrightConfig, CamoufoxConfig
|
18
|
+
from ._config_tools import _compiled_stealth_scripts, _launch_kwargs, _context_kwargs
|
19
|
+
from scrapling.engines.toolbelt.navigation import intercept_route, async_intercept_route
|
26
20
|
|
27
21
|
__ff_version_str__ = camoufox_version().split(".", 1)[0]
|
28
22
|
|
@@ -45,6 +39,7 @@ class SyncSession:
|
|
45
39
|
"""Get a new page to use"""
|
46
40
|
|
47
41
|
# No need to check if a page is available or not in sync code because the code blocked before reaching here till the page closed, ofc.
|
42
|
+
assert self.context is not None, "Browser context not initialized"
|
48
43
|
page = self.context.new_page()
|
49
44
|
page.set_default_navigation_timeout(timeout)
|
50
45
|
page.set_default_timeout(timeout)
|
@@ -69,11 +64,14 @@ class SyncSession:
|
|
69
64
|
}
|
70
65
|
|
71
66
|
|
72
|
-
class AsyncSession
|
67
|
+
class AsyncSession:
|
73
68
|
def __init__(self, max_pages: int = 1):
|
74
|
-
|
69
|
+
self.max_pages = max_pages
|
70
|
+
self.page_pool = PagePool(max_pages)
|
71
|
+
self._max_wait_for_page = 60
|
75
72
|
self.playwright: Optional[AsyncPlaywright] = None
|
76
73
|
self.context: Optional[AsyncBrowserContext] = None
|
74
|
+
self._closed = False
|
77
75
|
self._lock = Lock()
|
78
76
|
|
79
77
|
async def _get_page(
|
@@ -83,6 +81,9 @@ class AsyncSession(SyncSession):
|
|
83
81
|
disable_resources: bool,
|
84
82
|
) -> PageInfo: # pragma: no cover
|
85
83
|
"""Get a new page to use"""
|
84
|
+
if TYPE_CHECKING:
|
85
|
+
assert self.context is not None, "Browser context not initialized"
|
86
|
+
|
86
87
|
async with self._lock:
|
87
88
|
# If we're at max capacity after cleanup, wait for busy pages to finish
|
88
89
|
if self.page_pool.pages_count >= self.max_pages:
|
@@ -96,6 +97,7 @@ class AsyncSession(SyncSession):
|
|
96
97
|
f"No pages finished to clear place in the pool within the {self._max_wait_for_page}s timeout period"
|
97
98
|
)
|
98
99
|
|
100
|
+
assert self.context is not None, "Browser context not initialized"
|
99
101
|
page = await self.context.new_page()
|
100
102
|
page.set_default_navigation_timeout(timeout)
|
101
103
|
page.set_default_timeout(timeout)
|
@@ -111,6 +113,14 @@ class AsyncSession(SyncSession):
|
|
111
113
|
|
112
114
|
return self.page_pool.add_page(page)
|
113
115
|
|
116
|
+
def get_pool_stats(self) -> Dict[str, int]:
|
117
|
+
"""Get statistics about the current page pool"""
|
118
|
+
return {
|
119
|
+
"total_pages": self.page_pool.pages_count,
|
120
|
+
"busy_pages": self.page_pool.busy_count,
|
121
|
+
"max_pages": self.max_pages,
|
122
|
+
}
|
123
|
+
|
114
124
|
|
115
125
|
class DynamicSessionMixin:
|
116
126
|
def __validate__(self, **params):
|
@@ -138,11 +148,16 @@ class DynamicSessionMixin:
|
|
138
148
|
self.init_script = config.init_script
|
139
149
|
self.wait_selector_state = config.wait_selector_state
|
140
150
|
self.selector_config = config.selector_config
|
151
|
+
self.additional_args = config.additional_args
|
141
152
|
self.page_action = config.page_action
|
142
|
-
self.
|
153
|
+
self.user_data_dir = config.user_data_dir
|
154
|
+
self._headers_keys = {header.lower() for header in self.extra_headers.keys()} if self.extra_headers else set()
|
143
155
|
self.__initiate_browser_options__()
|
144
156
|
|
145
157
|
def __initiate_browser_options__(self):
|
158
|
+
if TYPE_CHECKING:
|
159
|
+
assert isinstance(self.proxy, tuple)
|
160
|
+
|
146
161
|
if not self.cdp_url:
|
147
162
|
# `launch_options` is used with persistent context
|
148
163
|
self.launch_options = dict(
|
@@ -160,6 +175,8 @@ class DynamicSessionMixin:
|
|
160
175
|
)
|
161
176
|
self.launch_options["extra_http_headers"] = dict(self.launch_options["extra_http_headers"])
|
162
177
|
self.launch_options["proxy"] = dict(self.launch_options["proxy"]) or None
|
178
|
+
self.launch_options["user_data_dir"] = self.user_data_dir
|
179
|
+
self.launch_options.update(cast(Dict, self.additional_args))
|
163
180
|
self.context_options = dict()
|
164
181
|
else:
|
165
182
|
# while `context_options` is left to be used when cdp mode is enabled
|
@@ -175,11 +192,12 @@ class DynamicSessionMixin:
|
|
175
192
|
)
|
176
193
|
self.context_options["extra_http_headers"] = dict(self.context_options["extra_http_headers"])
|
177
194
|
self.context_options["proxy"] = dict(self.context_options["proxy"]) or None
|
195
|
+
self.context_options.update(cast(Dict, self.additional_args))
|
178
196
|
|
179
197
|
|
180
198
|
class StealthySessionMixin:
|
181
199
|
def __validate__(self, **params):
|
182
|
-
config = validate(params, model=CamoufoxConfig)
|
200
|
+
config: CamoufoxConfig = validate(params, model=CamoufoxConfig)
|
183
201
|
|
184
202
|
self.max_pages = config.max_pages
|
185
203
|
self.headless = config.headless
|
@@ -208,15 +226,16 @@ class StealthySessionMixin:
|
|
208
226
|
self.selector_config = config.selector_config
|
209
227
|
self.additional_args = config.additional_args
|
210
228
|
self.page_action = config.page_action
|
211
|
-
self.
|
229
|
+
self.user_data_dir = config.user_data_dir
|
230
|
+
self._headers_keys = {header.lower() for header in self.extra_headers.keys()} if self.extra_headers else set()
|
212
231
|
self.__initiate_browser_options__()
|
213
232
|
|
214
233
|
def __initiate_browser_options__(self):
|
215
234
|
"""Initiate browser options."""
|
216
|
-
self.launch_options = generate_launch_options(
|
235
|
+
self.launch_options: Dict[str, Any] = generate_launch_options(
|
217
236
|
**{
|
218
237
|
"geoip": self.geoip,
|
219
|
-
"proxy": dict(self.proxy) if self.proxy else self.proxy,
|
238
|
+
"proxy": dict(self.proxy) if self.proxy and isinstance(self.proxy, tuple) else self.proxy,
|
220
239
|
"addons": self.addons,
|
221
240
|
"exclude_addons": [] if self.disable_ads else [DefaultAddons.UBO],
|
222
241
|
"headless": self.headless,
|
@@ -226,7 +245,7 @@ class StealthySessionMixin:
|
|
226
245
|
"block_webrtc": self.block_webrtc,
|
227
246
|
"block_images": self.block_images, # Careful! it makes some websites don't finish loading at all like stackoverflow even in headful mode.
|
228
247
|
"os": None if self.os_randomize else get_os_name(),
|
229
|
-
"user_data_dir":
|
248
|
+
"user_data_dir": self.user_data_dir,
|
230
249
|
"ff_version": __ff_version_str__,
|
231
250
|
"firefox_user_prefs": {
|
232
251
|
# This is what enabling `enable_cache` does internally, so we do it from here instead
|
@@ -236,7 +255,7 @@ class StealthySessionMixin:
|
|
236
255
|
"browser.cache.disk_cache_ssl": True,
|
237
256
|
"browser.cache.disk.smart_size.enabled": True,
|
238
257
|
},
|
239
|
-
**self.additional_args,
|
258
|
+
**cast(Dict, self.additional_args),
|
240
259
|
}
|
241
260
|
)
|
242
261
|
|
@@ -268,4 +287,9 @@ class StealthySessionMixin:
|
|
268
287
|
if f"cType: '{ctype}'" in page_content:
|
269
288
|
return ctype
|
270
289
|
|
290
|
+
# Check if turnstile captcha is embedded inside the page (Usually inside a closed Shadow iframe)
|
291
|
+
selector = Selector(content=page_content)
|
292
|
+
if selector.css('script[src*="challenges.cloudflare.com/turnstile/v"]'):
|
293
|
+
return "embedded"
|
294
|
+
|
271
295
|
return None
|
@@ -1,3 +1,4 @@
|
|
1
|
+
from random import randint
|
1
2
|
from re import compile as re_compile
|
2
3
|
|
3
4
|
from playwright.sync_api import (
|
@@ -20,10 +21,12 @@ from ._validators import validate_fetch as _validate
|
|
20
21
|
from ._base import SyncSession, AsyncSession, StealthySessionMixin
|
21
22
|
from scrapling.core.utils import log
|
22
23
|
from scrapling.core._types import (
|
24
|
+
Any,
|
23
25
|
Dict,
|
24
26
|
List,
|
25
27
|
Optional,
|
26
28
|
Callable,
|
29
|
+
TYPE_CHECKING,
|
27
30
|
SelectorWaitStates,
|
28
31
|
)
|
29
32
|
from scrapling.engines.toolbelt.convertor import (
|
@@ -33,7 +36,7 @@ from scrapling.engines.toolbelt.convertor import (
|
|
33
36
|
from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
|
34
37
|
|
35
38
|
__CF_PATTERN__ = re_compile("challenges.cloudflare.com/cdn-cgi/challenge-platform/.*")
|
36
|
-
_UNSET = object()
|
39
|
+
_UNSET: Any = object()
|
37
40
|
|
38
41
|
|
39
42
|
class StealthySession(StealthySessionMixin, SyncSession):
|
@@ -101,6 +104,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
101
104
|
os_randomize: bool = False,
|
102
105
|
disable_ads: bool = False,
|
103
106
|
geoip: bool = False,
|
107
|
+
user_data_dir: str = "",
|
104
108
|
selector_config: Optional[Dict] = None,
|
105
109
|
additional_args: Optional[Dict] = None,
|
106
110
|
):
|
@@ -116,7 +120,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
116
120
|
:param cookies: Set cookies for the next request.
|
117
121
|
:param addons: List of Firefox addons to use. Must be paths to extracted addons.
|
118
122
|
:param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
|
119
|
-
:param solve_cloudflare: Solves all
|
123
|
+
:param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
|
120
124
|
:param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
|
121
125
|
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
122
126
|
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
@@ -133,6 +137,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
133
137
|
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
134
138
|
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
135
139
|
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
140
|
+
:param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
136
141
|
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
137
142
|
:param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
|
138
143
|
"""
|
@@ -156,6 +161,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
156
161
|
block_images=block_images,
|
157
162
|
block_webrtc=block_webrtc,
|
158
163
|
os_randomize=os_randomize,
|
164
|
+
user_data_dir=user_data_dir,
|
159
165
|
wait_selector=wait_selector,
|
160
166
|
google_search=google_search,
|
161
167
|
extra_headers=extra_headers,
|
@@ -170,9 +176,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
170
176
|
def __create__(self):
|
171
177
|
"""Create a browser for this instance and context."""
|
172
178
|
self.playwright = sync_playwright().start()
|
173
|
-
self.context = self.playwright.firefox.launch_persistent_context(
|
174
|
-
**self.launch_options
|
175
|
-
)
|
179
|
+
self.context = self.playwright.firefox.launch_persistent_context(**self.launch_options)
|
176
180
|
|
177
181
|
if self.init_script: # pragma: no cover
|
178
182
|
self.context.add_init_script(path=self.init_script)
|
@@ -203,9 +207,9 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
203
207
|
self._closed = True
|
204
208
|
|
205
209
|
@staticmethod
|
206
|
-
def _get_page_content(page: Page) -> str
|
210
|
+
def _get_page_content(page: Page) -> str:
|
207
211
|
"""
|
208
|
-
A workaround for Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
|
212
|
+
A workaround for the Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
|
209
213
|
:param page: The page to extract content from.
|
210
214
|
:return:
|
211
215
|
"""
|
@@ -215,6 +219,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
215
219
|
except PlaywrightError:
|
216
220
|
page.wait_for_timeout(1000)
|
217
221
|
continue
|
222
|
+
return "" # pyright: ignore
|
218
223
|
|
219
224
|
def _solve_cloudflare(self, page: Page) -> None: # pragma: no cover
|
220
225
|
"""Solve the cloudflare challenge displayed on the playwright page passed
|
@@ -222,6 +227,10 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
222
227
|
:param page: The targeted page
|
223
228
|
:return:
|
224
229
|
"""
|
230
|
+
try:
|
231
|
+
page.wait_for_load_state("networkidle", timeout=5000)
|
232
|
+
except PlaywrightError:
|
233
|
+
pass
|
225
234
|
challenge_type = self._detect_cloudflare(self._get_page_content(page))
|
226
235
|
if not challenge_type:
|
227
236
|
log.error("No Cloudflare challenge found.")
|
@@ -237,26 +246,42 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
237
246
|
return
|
238
247
|
|
239
248
|
else:
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
249
|
+
box_selector = "#cf_turnstile div, #cf-turnstile div, .turnstile>div>div"
|
250
|
+
if challenge_type != "embedded":
|
251
|
+
box_selector = ".main-content p+div>div>div"
|
252
|
+
while "Verifying you are human." in self._get_page_content(page):
|
253
|
+
# Waiting for the verify spinner to disappear, checking every 1s if it disappeared
|
254
|
+
page.wait_for_timeout(500)
|
255
|
+
|
256
|
+
outer_box = {}
|
244
257
|
iframe = page.frame(url=__CF_PATTERN__)
|
245
|
-
if iframe is None:
|
246
|
-
|
247
|
-
|
258
|
+
if iframe is not None:
|
259
|
+
iframe.wait_for_load_state(state="domcontentloaded")
|
260
|
+
iframe.wait_for_load_state("networkidle")
|
261
|
+
|
262
|
+
if challenge_type != "embedded":
|
263
|
+
while not iframe.frame_element().is_visible():
|
264
|
+
# Double-checking that the iframe is loaded
|
265
|
+
page.wait_for_timeout(500)
|
266
|
+
outer_box: Any = iframe.frame_element().bounding_box()
|
248
267
|
|
249
|
-
|
250
|
-
|
251
|
-
page.wait_for_timeout(500)
|
268
|
+
if not iframe or not outer_box:
|
269
|
+
outer_box: Any = page.locator(box_selector).last.bounding_box()
|
252
270
|
|
253
271
|
# Calculate the Captcha coordinates for any viewport
|
254
|
-
|
255
|
-
captcha_x, captcha_y = outer_box["x"] + 26, outer_box["y"] + 25
|
272
|
+
captcha_x, captcha_y = outer_box["x"] + randint(26, 28), outer_box["y"] + randint(25, 27)
|
256
273
|
|
257
274
|
# Move the mouse to the center of the window, then press and hold the left mouse button
|
258
275
|
page.mouse.click(captcha_x, captcha_y, delay=60, button="left")
|
259
|
-
page.
|
276
|
+
page.wait_for_load_state("networkidle")
|
277
|
+
if iframe is not None:
|
278
|
+
# Wait for the frame to be removed from the page
|
279
|
+
while iframe in page.frames:
|
280
|
+
page.wait_for_timeout(100)
|
281
|
+
if challenge_type != "embedded":
|
282
|
+
page.locator(box_selector).last.wait_for(state="detached")
|
283
|
+
page.locator(".zone-name-title").wait_for(state="hidden")
|
284
|
+
page.wait_for_load_state(state="load")
|
260
285
|
page.wait_for_load_state(state="domcontentloaded")
|
261
286
|
|
262
287
|
log.info("Cloudflare captcha is solved")
|
@@ -293,7 +318,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
293
318
|
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
294
319
|
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
295
320
|
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
296
|
-
:param solve_cloudflare: Solves all
|
321
|
+
:param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
|
297
322
|
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
298
323
|
:return: A `Response` object.
|
299
324
|
"""
|
@@ -328,6 +353,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
328
353
|
if (
|
329
354
|
finished_response.request.resource_type == "document"
|
330
355
|
and finished_response.request.is_navigation_request()
|
356
|
+
and finished_response.request.frame == page_info.page.main_frame
|
331
357
|
):
|
332
358
|
final_response = finished_response
|
333
359
|
|
@@ -380,7 +406,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
380
406
|
page_info.page, first_response, final_response, params.selector_config
|
381
407
|
)
|
382
408
|
|
383
|
-
# Close the page
|
409
|
+
# Close the page to free up resources
|
384
410
|
page_info.page.close()
|
385
411
|
self.page_pool.pages.remove(page_info)
|
386
412
|
|
@@ -420,6 +446,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
420
446
|
os_randomize: bool = False,
|
421
447
|
disable_ads: bool = False,
|
422
448
|
geoip: bool = False,
|
449
|
+
user_data_dir: str = "",
|
423
450
|
selector_config: Optional[Dict] = None,
|
424
451
|
additional_args: Optional[Dict] = None,
|
425
452
|
):
|
@@ -435,7 +462,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
435
462
|
:param cookies: Set cookies for the next request.
|
436
463
|
:param addons: List of Firefox addons to use. Must be paths to extracted addons.
|
437
464
|
:param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
|
438
|
-
:param solve_cloudflare: Solves all
|
465
|
+
:param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
|
439
466
|
:param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
|
440
467
|
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
441
468
|
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
@@ -453,6 +480,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
453
480
|
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
454
481
|
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
455
482
|
:param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
|
483
|
+
:param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
456
484
|
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
457
485
|
:param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
|
458
486
|
"""
|
@@ -478,6 +506,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
478
506
|
wait_selector=wait_selector,
|
479
507
|
google_search=google_search,
|
480
508
|
extra_headers=extra_headers,
|
509
|
+
user_data_dir=user_data_dir,
|
481
510
|
additional_args=additional_args,
|
482
511
|
selector_config=selector_config,
|
483
512
|
solve_cloudflare=solve_cloudflare,
|
@@ -497,7 +526,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
497
526
|
await self.context.add_init_script(path=self.init_script)
|
498
527
|
|
499
528
|
if self.cookies:
|
500
|
-
await self.context.add_cookies(self.cookies)
|
529
|
+
await self.context.add_cookies(self.cookies) # pyright: ignore [reportArgumentType]
|
501
530
|
|
502
531
|
async def __aenter__(self):
|
503
532
|
await self.__create__()
|
@@ -513,18 +542,18 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
513
542
|
|
514
543
|
if self.context:
|
515
544
|
await self.context.close()
|
516
|
-
self.context = None
|
545
|
+
self.context = None # pyright: ignore
|
517
546
|
|
518
547
|
if self.playwright:
|
519
548
|
await self.playwright.stop()
|
520
|
-
self.playwright = None
|
549
|
+
self.playwright = None # pyright: ignore
|
521
550
|
|
522
551
|
self._closed = True
|
523
552
|
|
524
553
|
@staticmethod
|
525
|
-
async def _get_page_content(page: async_Page) -> str
|
554
|
+
async def _get_page_content(page: async_Page) -> str:
|
526
555
|
"""
|
527
|
-
A workaround for Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
|
556
|
+
A workaround for the Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
|
528
557
|
:param page: The page to extract content from.
|
529
558
|
:return:
|
530
559
|
"""
|
@@ -534,6 +563,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
534
563
|
except PlaywrightError:
|
535
564
|
await page.wait_for_timeout(1000)
|
536
565
|
continue
|
566
|
+
return "" # pyright: ignore
|
537
567
|
|
538
568
|
async def _solve_cloudflare(self, page: async_Page):
|
539
569
|
"""Solve the cloudflare challenge displayed on the playwright page passed. The async version
|
@@ -541,6 +571,10 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
541
571
|
:param page: The async targeted page
|
542
572
|
:return:
|
543
573
|
"""
|
574
|
+
try:
|
575
|
+
await page.wait_for_load_state("networkidle", timeout=5000)
|
576
|
+
except PlaywrightError:
|
577
|
+
pass
|
544
578
|
challenge_type = self._detect_cloudflare(await self._get_page_content(page))
|
545
579
|
if not challenge_type:
|
546
580
|
log.error("No Cloudflare challenge found.")
|
@@ -556,26 +590,42 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
556
590
|
return
|
557
591
|
|
558
592
|
else:
|
559
|
-
|
560
|
-
|
561
|
-
|
562
|
-
|
593
|
+
box_selector = "#cf_turnstile div, #cf-turnstile div, .turnstile>div>div"
|
594
|
+
if challenge_type != "embedded":
|
595
|
+
box_selector = ".main-content p+div>div>div"
|
596
|
+
while "Verifying you are human." in (await self._get_page_content(page)):
|
597
|
+
# Waiting for the verify spinner to disappear, checking every 1s if it disappeared
|
598
|
+
await page.wait_for_timeout(500)
|
599
|
+
|
600
|
+
outer_box = {}
|
563
601
|
iframe = page.frame(url=__CF_PATTERN__)
|
564
|
-
if iframe is None:
|
565
|
-
|
566
|
-
|
602
|
+
if iframe is not None:
|
603
|
+
await iframe.wait_for_load_state(state="domcontentloaded")
|
604
|
+
await iframe.wait_for_load_state("networkidle")
|
567
605
|
|
568
|
-
|
569
|
-
|
570
|
-
|
606
|
+
if challenge_type != "embedded":
|
607
|
+
while not await (await iframe.frame_element()).is_visible():
|
608
|
+
# Double-checking that the iframe is loaded
|
609
|
+
await page.wait_for_timeout(500)
|
610
|
+
outer_box: Any = await (await iframe.frame_element()).bounding_box()
|
611
|
+
|
612
|
+
if not iframe or not outer_box:
|
613
|
+
outer_box: Any = await page.locator(box_selector).last.bounding_box()
|
571
614
|
|
572
615
|
# Calculate the Captcha coordinates for any viewport
|
573
|
-
|
574
|
-
captcha_x, captcha_y = outer_box["x"] + 26, outer_box["y"] + 25
|
616
|
+
captcha_x, captcha_y = outer_box["x"] + randint(26, 28), outer_box["y"] + randint(25, 27)
|
575
617
|
|
576
618
|
# Move the mouse to the center of the window, then press and hold the left mouse button
|
577
619
|
await page.mouse.click(captcha_x, captcha_y, delay=60, button="left")
|
578
|
-
await page.
|
620
|
+
await page.wait_for_load_state("networkidle")
|
621
|
+
if iframe is not None:
|
622
|
+
# Wait for the frame to be removed from the page
|
623
|
+
while iframe in page.frames:
|
624
|
+
await page.wait_for_timeout(100)
|
625
|
+
if challenge_type != "embedded":
|
626
|
+
await page.locator(box_selector).wait_for(state="detached")
|
627
|
+
await page.locator(".zone-name-title").wait_for(state="hidden")
|
628
|
+
await page.wait_for_load_state(state="load")
|
579
629
|
await page.wait_for_load_state(state="domcontentloaded")
|
580
630
|
|
581
631
|
log.info("Cloudflare captcha is solved")
|
@@ -612,7 +662,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
612
662
|
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
613
663
|
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
614
664
|
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
615
|
-
:param solve_cloudflare: Solves all
|
665
|
+
:param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
|
616
666
|
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
617
667
|
:return: A `Response` object.
|
618
668
|
"""
|
@@ -647,12 +697,17 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
647
697
|
if (
|
648
698
|
finished_response.request.resource_type == "document"
|
649
699
|
and finished_response.request.is_navigation_request()
|
700
|
+
and finished_response.request.frame == page_info.page.main_frame
|
650
701
|
):
|
651
702
|
final_response = finished_response
|
652
703
|
|
653
704
|
page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
|
654
705
|
page_info.mark_busy(url=url)
|
655
706
|
|
707
|
+
if TYPE_CHECKING:
|
708
|
+
if not isinstance(page_info.page, async_Page):
|
709
|
+
raise TypeError
|
710
|
+
|
656
711
|
try:
|
657
712
|
# Navigate to URL and wait for a specified state
|
658
713
|
page_info.page.on("response", handle_response)
|
@@ -701,7 +756,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
701
756
|
page_info.page, first_response, final_response, params.selector_config
|
702
757
|
)
|
703
758
|
|
704
|
-
# Close the page
|
759
|
+
# Close the page to free up resources
|
705
760
|
await page_info.page.close()
|
706
761
|
self.page_pool.pages.remove(page_info)
|
707
762
|
|