scrapling 0.3.7__py3-none-any.whl → 0.3.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/engines/_browsers/_base.py +140 -9
- scrapling/engines/_browsers/_camoufox.py +47 -164
- scrapling/engines/_browsers/_config_tools.py +8 -2
- scrapling/engines/_browsers/_controllers.py +25 -96
- scrapling/engines/_browsers/_validators.py +72 -61
- scrapling/engines/toolbelt/convertor.py +37 -2
- scrapling/engines/toolbelt/custom.py +0 -12
- scrapling/engines/toolbelt/fingerprints.py +6 -8
- scrapling/fetchers/chrome.py +6 -0
- {scrapling-0.3.7.dist-info → scrapling-0.3.8.dist-info}/METADATA +6 -4
- {scrapling-0.3.7.dist-info → scrapling-0.3.8.dist-info}/RECORD +16 -16
- {scrapling-0.3.7.dist-info → scrapling-0.3.8.dist-info}/WHEEL +0 -0
- {scrapling-0.3.7.dist-info → scrapling-0.3.8.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.7.dist-info → scrapling-0.3.8.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.7.dist-info → scrapling-0.3.8.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
|
@@ -2,17 +2,27 @@ from time import time
|
|
|
2
2
|
from asyncio import sleep as asyncio_sleep, Lock
|
|
3
3
|
|
|
4
4
|
from camoufox import DefaultAddons
|
|
5
|
-
from playwright.sync_api import
|
|
5
|
+
from playwright.sync_api import (
|
|
6
|
+
Page,
|
|
7
|
+
Frame,
|
|
8
|
+
BrowserContext,
|
|
9
|
+
Playwright,
|
|
10
|
+
Response as SyncPlaywrightResponse,
|
|
11
|
+
)
|
|
6
12
|
from playwright.async_api import (
|
|
7
|
-
|
|
13
|
+
Page as AsyncPage,
|
|
14
|
+
Frame as AsyncFrame,
|
|
8
15
|
Playwright as AsyncPlaywright,
|
|
16
|
+
Response as AsyncPlaywrightResponse,
|
|
17
|
+
BrowserContext as AsyncBrowserContext,
|
|
9
18
|
)
|
|
19
|
+
from playwright._impl._errors import Error as PlaywrightError
|
|
10
20
|
from camoufox.pkgman import installed_verstr as camoufox_version
|
|
11
21
|
from camoufox.utils import launch_options as generate_launch_options
|
|
12
22
|
|
|
13
23
|
from ._page import PageInfo, PagePool
|
|
14
24
|
from scrapling.parser import Selector
|
|
15
|
-
from scrapling.core._types import Any, cast, Dict, Optional, TYPE_CHECKING
|
|
25
|
+
from scrapling.core._types import Any, cast, Dict, List, Optional, Callable, TYPE_CHECKING
|
|
16
26
|
from scrapling.engines.toolbelt.fingerprints import get_os_name
|
|
17
27
|
from ._validators import validate, PlaywrightConfig, CamoufoxConfig
|
|
18
28
|
from ._config_tools import _compiled_stealth_scripts, _launch_kwargs, _context_kwargs
|
|
@@ -26,10 +36,35 @@ class SyncSession:
|
|
|
26
36
|
self.max_pages = max_pages
|
|
27
37
|
self.page_pool = PagePool(max_pages)
|
|
28
38
|
self._max_wait_for_page = 60
|
|
29
|
-
self.playwright:
|
|
30
|
-
self.context:
|
|
39
|
+
self.playwright: Playwright | Any = None
|
|
40
|
+
self.context: BrowserContext | Any = None
|
|
31
41
|
self._closed = False
|
|
32
42
|
|
|
43
|
+
def __create__(self):
|
|
44
|
+
pass
|
|
45
|
+
|
|
46
|
+
def close(self): # pragma: no cover
|
|
47
|
+
"""Close all resources"""
|
|
48
|
+
if self._closed:
|
|
49
|
+
return
|
|
50
|
+
|
|
51
|
+
if self.context:
|
|
52
|
+
self.context.close()
|
|
53
|
+
self.context = None
|
|
54
|
+
|
|
55
|
+
if self.playwright:
|
|
56
|
+
self.playwright.stop()
|
|
57
|
+
self.playwright = None # pyright: ignore
|
|
58
|
+
|
|
59
|
+
self._closed = True
|
|
60
|
+
|
|
61
|
+
def __enter__(self):
|
|
62
|
+
self.__create__()
|
|
63
|
+
return self
|
|
64
|
+
|
|
65
|
+
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
66
|
+
self.close()
|
|
67
|
+
|
|
33
68
|
def _get_page(
|
|
34
69
|
self,
|
|
35
70
|
timeout: int | float,
|
|
@@ -53,7 +88,9 @@ class SyncSession:
|
|
|
53
88
|
for script in _compiled_stealth_scripts():
|
|
54
89
|
page.add_init_script(script=script)
|
|
55
90
|
|
|
56
|
-
|
|
91
|
+
page_info = self.page_pool.add_page(page)
|
|
92
|
+
page_info.mark_busy()
|
|
93
|
+
return page_info
|
|
57
94
|
|
|
58
95
|
def get_pool_stats(self) -> Dict[str, int]:
|
|
59
96
|
"""Get statistics about the current page pool"""
|
|
@@ -63,17 +100,76 @@ class SyncSession:
|
|
|
63
100
|
"max_pages": self.max_pages,
|
|
64
101
|
}
|
|
65
102
|
|
|
103
|
+
@staticmethod
|
|
104
|
+
def _wait_for_networkidle(page: Page | Frame, timeout: Optional[int] = None):
|
|
105
|
+
"""Wait for the page to become idle (no network activity) even if there are never-ending requests."""
|
|
106
|
+
try:
|
|
107
|
+
page.wait_for_load_state("networkidle", timeout=timeout)
|
|
108
|
+
except PlaywrightError:
|
|
109
|
+
pass
|
|
110
|
+
|
|
111
|
+
def _wait_for_page_stability(self, page: Page | Frame, load_dom: bool, network_idle: bool):
|
|
112
|
+
page.wait_for_load_state(state="load")
|
|
113
|
+
if load_dom:
|
|
114
|
+
page.wait_for_load_state(state="domcontentloaded")
|
|
115
|
+
if network_idle:
|
|
116
|
+
self._wait_for_networkidle(page)
|
|
117
|
+
|
|
118
|
+
@staticmethod
|
|
119
|
+
def _create_response_handler(page_info: PageInfo, response_container: List) -> Callable:
|
|
120
|
+
"""Create a response handler that captures the final navigation response.
|
|
121
|
+
|
|
122
|
+
:param page_info: The PageInfo object containing the page
|
|
123
|
+
:param response_container: A list to store the final response (mutable container)
|
|
124
|
+
:return: A callback function for page.on("response", ...)
|
|
125
|
+
"""
|
|
126
|
+
|
|
127
|
+
def handle_response(finished_response: SyncPlaywrightResponse):
|
|
128
|
+
if (
|
|
129
|
+
finished_response.request.resource_type == "document"
|
|
130
|
+
and finished_response.request.is_navigation_request()
|
|
131
|
+
and finished_response.request.frame == page_info.page.main_frame
|
|
132
|
+
):
|
|
133
|
+
response_container[0] = finished_response
|
|
134
|
+
|
|
135
|
+
return handle_response
|
|
136
|
+
|
|
66
137
|
|
|
67
138
|
class AsyncSession:
|
|
68
139
|
def __init__(self, max_pages: int = 1):
|
|
69
140
|
self.max_pages = max_pages
|
|
70
141
|
self.page_pool = PagePool(max_pages)
|
|
71
142
|
self._max_wait_for_page = 60
|
|
72
|
-
self.playwright:
|
|
73
|
-
self.context:
|
|
143
|
+
self.playwright: AsyncPlaywright | Any = None
|
|
144
|
+
self.context: AsyncBrowserContext | Any = None
|
|
74
145
|
self._closed = False
|
|
75
146
|
self._lock = Lock()
|
|
76
147
|
|
|
148
|
+
async def __create__(self):
|
|
149
|
+
pass
|
|
150
|
+
|
|
151
|
+
async def close(self):
|
|
152
|
+
"""Close all resources"""
|
|
153
|
+
if self._closed: # pragma: no cover
|
|
154
|
+
return
|
|
155
|
+
|
|
156
|
+
if self.context:
|
|
157
|
+
await self.context.close()
|
|
158
|
+
self.context = None # pyright: ignore
|
|
159
|
+
|
|
160
|
+
if self.playwright:
|
|
161
|
+
await self.playwright.stop()
|
|
162
|
+
self.playwright = None # pyright: ignore
|
|
163
|
+
|
|
164
|
+
self._closed = True
|
|
165
|
+
|
|
166
|
+
async def __aenter__(self):
|
|
167
|
+
await self.__create__()
|
|
168
|
+
return self
|
|
169
|
+
|
|
170
|
+
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
|
171
|
+
await self.close()
|
|
172
|
+
|
|
77
173
|
async def _get_page(
|
|
78
174
|
self,
|
|
79
175
|
timeout: int | float,
|
|
@@ -97,7 +193,6 @@ class AsyncSession:
|
|
|
97
193
|
f"No pages finished to clear place in the pool within the {self._max_wait_for_page}s timeout period"
|
|
98
194
|
)
|
|
99
195
|
|
|
100
|
-
assert self.context is not None, "Browser context not initialized"
|
|
101
196
|
page = await self.context.new_page()
|
|
102
197
|
page.set_default_navigation_timeout(timeout)
|
|
103
198
|
page.set_default_timeout(timeout)
|
|
@@ -121,6 +216,40 @@ class AsyncSession:
|
|
|
121
216
|
"max_pages": self.max_pages,
|
|
122
217
|
}
|
|
123
218
|
|
|
219
|
+
@staticmethod
|
|
220
|
+
async def _wait_for_networkidle(page: AsyncPage | AsyncFrame, timeout: Optional[int] = None):
|
|
221
|
+
"""Wait for the page to become idle (no network activity) even if there are never-ending requests."""
|
|
222
|
+
try:
|
|
223
|
+
await page.wait_for_load_state("networkidle", timeout=timeout)
|
|
224
|
+
except PlaywrightError:
|
|
225
|
+
pass
|
|
226
|
+
|
|
227
|
+
async def _wait_for_page_stability(self, page: AsyncPage | AsyncFrame, load_dom: bool, network_idle: bool):
|
|
228
|
+
await page.wait_for_load_state(state="load")
|
|
229
|
+
if load_dom:
|
|
230
|
+
await page.wait_for_load_state(state="domcontentloaded")
|
|
231
|
+
if network_idle:
|
|
232
|
+
await self._wait_for_networkidle(page)
|
|
233
|
+
|
|
234
|
+
@staticmethod
|
|
235
|
+
def _create_response_handler(page_info: PageInfo, response_container: List) -> Callable:
|
|
236
|
+
"""Create an async response handler that captures the final navigation response.
|
|
237
|
+
|
|
238
|
+
:param page_info: The PageInfo object containing the page
|
|
239
|
+
:param response_container: A list to store the final response (mutable container)
|
|
240
|
+
:return: A callback function for page.on("response", ...)
|
|
241
|
+
"""
|
|
242
|
+
|
|
243
|
+
async def handle_response(finished_response: AsyncPlaywrightResponse):
|
|
244
|
+
if (
|
|
245
|
+
finished_response.request.resource_type == "document"
|
|
246
|
+
and finished_response.request.is_navigation_request()
|
|
247
|
+
and finished_response.request.frame == page_info.page.main_frame
|
|
248
|
+
):
|
|
249
|
+
response_container[0] = finished_response
|
|
250
|
+
|
|
251
|
+
return handle_response
|
|
252
|
+
|
|
124
253
|
|
|
125
254
|
class DynamicSessionMixin:
|
|
126
255
|
def __validate__(self, **params):
|
|
@@ -147,6 +276,7 @@ class DynamicSessionMixin:
|
|
|
147
276
|
self.wait_selector = config.wait_selector
|
|
148
277
|
self.init_script = config.init_script
|
|
149
278
|
self.wait_selector_state = config.wait_selector_state
|
|
279
|
+
self.extra_flags = config.extra_flags
|
|
150
280
|
self.selector_config = config.selector_config
|
|
151
281
|
self.additional_args = config.additional_args
|
|
152
282
|
self.page_action = config.page_action
|
|
@@ -171,6 +301,7 @@ class DynamicSessionMixin:
|
|
|
171
301
|
self.stealth,
|
|
172
302
|
self.hide_canvas,
|
|
173
303
|
self.disable_webgl,
|
|
304
|
+
tuple(self.extra_flags) if self.extra_flags else tuple(),
|
|
174
305
|
)
|
|
175
306
|
)
|
|
176
307
|
self.launch_options["extra_http_headers"] = dict(self.launch_options["extra_http_headers"])
|
|
@@ -2,22 +2,19 @@ from random import randint
|
|
|
2
2
|
from re import compile as re_compile
|
|
3
3
|
|
|
4
4
|
from playwright.sync_api import (
|
|
5
|
-
Response as SyncPlaywrightResponse,
|
|
6
|
-
sync_playwright,
|
|
7
|
-
Locator,
|
|
8
5
|
Page,
|
|
6
|
+
Locator,
|
|
7
|
+
sync_playwright,
|
|
9
8
|
)
|
|
10
9
|
from playwright.async_api import (
|
|
11
10
|
async_playwright,
|
|
12
|
-
Response as AsyncPlaywrightResponse,
|
|
13
|
-
BrowserContext as AsyncBrowserContext,
|
|
14
|
-
Playwright as AsyncPlaywright,
|
|
15
|
-
Locator as AsyncLocator,
|
|
16
11
|
Page as async_Page,
|
|
12
|
+
Locator as AsyncLocator,
|
|
13
|
+
Playwright as AsyncPlaywright,
|
|
14
|
+
BrowserContext as AsyncBrowserContext,
|
|
17
15
|
)
|
|
18
|
-
from playwright._impl._errors import Error as PlaywrightError
|
|
19
16
|
|
|
20
|
-
from ._validators import validate_fetch as _validate
|
|
17
|
+
from ._validators import validate_fetch as _validate, CamoufoxConfig
|
|
21
18
|
from ._base import SyncSession, AsyncSession, StealthySessionMixin
|
|
22
19
|
from scrapling.core.utils import log
|
|
23
20
|
from scrapling.core._types import (
|
|
@@ -184,61 +181,21 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
|
184
181
|
if self.cookies: # pragma: no cover
|
|
185
182
|
self.context.add_cookies(self.cookies)
|
|
186
183
|
|
|
187
|
-
def __enter__(self): # pragma: no cover
|
|
188
|
-
self.__create__()
|
|
189
|
-
return self
|
|
190
|
-
|
|
191
|
-
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
192
|
-
self.close()
|
|
193
|
-
|
|
194
|
-
def close(self): # pragma: no cover
|
|
195
|
-
"""Close all resources"""
|
|
196
|
-
if self._closed: # pragma: no cover
|
|
197
|
-
return
|
|
198
|
-
|
|
199
|
-
if self.context:
|
|
200
|
-
self.context.close()
|
|
201
|
-
self.context = None
|
|
202
|
-
|
|
203
|
-
if self.playwright:
|
|
204
|
-
self.playwright.stop()
|
|
205
|
-
self.playwright = None
|
|
206
|
-
|
|
207
|
-
self._closed = True
|
|
208
|
-
|
|
209
|
-
@staticmethod
|
|
210
|
-
def _get_page_content(page: Page) -> str:
|
|
211
|
-
"""
|
|
212
|
-
A workaround for the Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
|
|
213
|
-
:param page: The page to extract content from.
|
|
214
|
-
:return:
|
|
215
|
-
"""
|
|
216
|
-
while True:
|
|
217
|
-
try:
|
|
218
|
-
return page.content() or ""
|
|
219
|
-
except PlaywrightError:
|
|
220
|
-
page.wait_for_timeout(1000)
|
|
221
|
-
continue
|
|
222
|
-
return "" # pyright: ignore
|
|
223
|
-
|
|
224
184
|
def _solve_cloudflare(self, page: Page) -> None: # pragma: no cover
|
|
225
185
|
"""Solve the cloudflare challenge displayed on the playwright page passed
|
|
226
186
|
|
|
227
187
|
:param page: The targeted page
|
|
228
188
|
:return:
|
|
229
189
|
"""
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
except PlaywrightError:
|
|
233
|
-
pass
|
|
234
|
-
challenge_type = self._detect_cloudflare(self._get_page_content(page))
|
|
190
|
+
self._wait_for_networkidle(page, timeout=5000)
|
|
191
|
+
challenge_type = self._detect_cloudflare(ResponseFactory._get_page_content(page))
|
|
235
192
|
if not challenge_type:
|
|
236
193
|
log.error("No Cloudflare challenge found.")
|
|
237
194
|
return
|
|
238
195
|
else:
|
|
239
196
|
log.info(f'The turnstile version discovered is "{challenge_type}"')
|
|
240
197
|
if challenge_type == "non-interactive":
|
|
241
|
-
while "<title>Just a moment...</title>" in (
|
|
198
|
+
while "<title>Just a moment...</title>" in (ResponseFactory._get_page_content(page)):
|
|
242
199
|
log.info("Waiting for Cloudflare wait page to disappear.")
|
|
243
200
|
page.wait_for_timeout(1000)
|
|
244
201
|
page.wait_for_load_state()
|
|
@@ -249,15 +206,14 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
|
249
206
|
box_selector = "#cf_turnstile div, #cf-turnstile div, .turnstile>div>div"
|
|
250
207
|
if challenge_type != "embedded":
|
|
251
208
|
box_selector = ".main-content p+div>div>div"
|
|
252
|
-
while "Verifying you are human." in
|
|
209
|
+
while "Verifying you are human." in ResponseFactory._get_page_content(page):
|
|
253
210
|
# Waiting for the verify spinner to disappear, checking every 1s if it disappeared
|
|
254
211
|
page.wait_for_timeout(500)
|
|
255
212
|
|
|
256
213
|
outer_box = {}
|
|
257
214
|
iframe = page.frame(url=__CF_PATTERN__)
|
|
258
215
|
if iframe is not None:
|
|
259
|
-
|
|
260
|
-
iframe.wait_for_load_state("networkidle")
|
|
216
|
+
self._wait_for_page_stability(iframe, True, True)
|
|
261
217
|
|
|
262
218
|
if challenge_type != "embedded":
|
|
263
219
|
while not iframe.frame_element().is_visible():
|
|
@@ -273,16 +229,20 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
|
273
229
|
|
|
274
230
|
# Move the mouse to the center of the window, then press and hold the left mouse button
|
|
275
231
|
page.mouse.click(captcha_x, captcha_y, delay=60, button="left")
|
|
276
|
-
|
|
232
|
+
self._wait_for_networkidle(page)
|
|
277
233
|
if iframe is not None:
|
|
278
|
-
# Wait for the frame to be removed from the page
|
|
234
|
+
# Wait for the frame to be removed from the page (with 30s timeout = 300 iterations * 100 ms)
|
|
235
|
+
attempts = 0
|
|
279
236
|
while iframe in page.frames:
|
|
237
|
+
if attempts >= 300:
|
|
238
|
+
log.info("Cloudflare iframe didn't disappear after 30s, continuing...")
|
|
239
|
+
break
|
|
280
240
|
page.wait_for_timeout(100)
|
|
241
|
+
attempts += 1
|
|
281
242
|
if challenge_type != "embedded":
|
|
282
243
|
page.locator(box_selector).last.wait_for(state="detached")
|
|
283
244
|
page.locator(".zone-name-title").wait_for(state="hidden")
|
|
284
|
-
|
|
285
|
-
page.wait_for_load_state(state="domcontentloaded")
|
|
245
|
+
self._wait_for_page_stability(page, True, False)
|
|
286
246
|
|
|
287
247
|
log.info("Cloudflare captcha is solved")
|
|
288
248
|
return
|
|
@@ -337,38 +297,26 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
|
337
297
|
("solve_cloudflare", solve_cloudflare, self.solve_cloudflare),
|
|
338
298
|
("selector_config", selector_config, self.selector_config),
|
|
339
299
|
],
|
|
300
|
+
CamoufoxConfig,
|
|
340
301
|
_UNSET,
|
|
341
302
|
)
|
|
342
303
|
|
|
343
304
|
if self._closed: # pragma: no cover
|
|
344
305
|
raise RuntimeError("Context manager has been closed")
|
|
345
306
|
|
|
346
|
-
final_response = None
|
|
347
307
|
referer = (
|
|
348
308
|
generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
|
|
349
309
|
)
|
|
350
310
|
|
|
351
|
-
def handle_response(finished_response: SyncPlaywrightResponse):
|
|
352
|
-
nonlocal final_response
|
|
353
|
-
if (
|
|
354
|
-
finished_response.request.resource_type == "document"
|
|
355
|
-
and finished_response.request.is_navigation_request()
|
|
356
|
-
and finished_response.request.frame == page_info.page.main_frame
|
|
357
|
-
):
|
|
358
|
-
final_response = finished_response
|
|
359
|
-
|
|
360
311
|
page_info = self._get_page(params.timeout, params.extra_headers, params.disable_resources)
|
|
361
|
-
|
|
312
|
+
final_response = [None]
|
|
313
|
+
handle_response = self._create_response_handler(page_info, final_response)
|
|
362
314
|
|
|
363
315
|
try: # pragma: no cover
|
|
364
316
|
# Navigate to URL and wait for a specified state
|
|
365
317
|
page_info.page.on("response", handle_response)
|
|
366
318
|
first_response = page_info.page.goto(url, referer=referer)
|
|
367
|
-
|
|
368
|
-
page_info.page.wait_for_load_state(state="domcontentloaded")
|
|
369
|
-
|
|
370
|
-
if params.network_idle:
|
|
371
|
-
page_info.page.wait_for_load_state("networkidle")
|
|
319
|
+
self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
|
|
372
320
|
|
|
373
321
|
if not first_response:
|
|
374
322
|
raise RuntimeError(f"Failed to get response for {url}")
|
|
@@ -376,11 +324,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
|
376
324
|
if params.solve_cloudflare:
|
|
377
325
|
self._solve_cloudflare(page_info.page)
|
|
378
326
|
# Make sure the page is fully loaded after the captcha
|
|
379
|
-
page_info.page.
|
|
380
|
-
if params.load_dom:
|
|
381
|
-
page_info.page.wait_for_load_state(state="domcontentloaded")
|
|
382
|
-
if params.network_idle:
|
|
383
|
-
page_info.page.wait_for_load_state("networkidle")
|
|
327
|
+
self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
|
|
384
328
|
|
|
385
329
|
if params.page_action:
|
|
386
330
|
try:
|
|
@@ -393,17 +337,13 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
|
393
337
|
waiter: Locator = page_info.page.locator(params.wait_selector)
|
|
394
338
|
waiter.first.wait_for(state=params.wait_selector_state)
|
|
395
339
|
# Wait again after waiting for the selector, helpful with protections like Cloudflare
|
|
396
|
-
page_info.page.
|
|
397
|
-
if params.load_dom:
|
|
398
|
-
page_info.page.wait_for_load_state(state="domcontentloaded")
|
|
399
|
-
if params.network_idle:
|
|
400
|
-
page_info.page.wait_for_load_state("networkidle")
|
|
340
|
+
self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
|
|
401
341
|
except Exception as e:
|
|
402
342
|
log.error(f"Error waiting for selector {params.wait_selector}: {e}")
|
|
403
343
|
|
|
404
344
|
page_info.page.wait_for_timeout(params.wait)
|
|
405
345
|
response = ResponseFactory.from_playwright_response(
|
|
406
|
-
page_info.page, first_response, final_response, params.selector_config
|
|
346
|
+
page_info.page, first_response, final_response[0], params.selector_config, bool(params.page_action)
|
|
407
347
|
)
|
|
408
348
|
|
|
409
349
|
# Close the page to free up resources
|
|
@@ -528,61 +468,21 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
|
528
468
|
if self.cookies:
|
|
529
469
|
await self.context.add_cookies(self.cookies) # pyright: ignore [reportArgumentType]
|
|
530
470
|
|
|
531
|
-
async def
|
|
532
|
-
await self.__create__()
|
|
533
|
-
return self
|
|
534
|
-
|
|
535
|
-
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
|
536
|
-
await self.close()
|
|
537
|
-
|
|
538
|
-
async def close(self):
|
|
539
|
-
"""Close all resources"""
|
|
540
|
-
if self._closed: # pragma: no cover
|
|
541
|
-
return
|
|
542
|
-
|
|
543
|
-
if self.context:
|
|
544
|
-
await self.context.close()
|
|
545
|
-
self.context = None # pyright: ignore
|
|
546
|
-
|
|
547
|
-
if self.playwright:
|
|
548
|
-
await self.playwright.stop()
|
|
549
|
-
self.playwright = None # pyright: ignore
|
|
550
|
-
|
|
551
|
-
self._closed = True
|
|
552
|
-
|
|
553
|
-
@staticmethod
|
|
554
|
-
async def _get_page_content(page: async_Page) -> str:
|
|
555
|
-
"""
|
|
556
|
-
A workaround for the Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
|
|
557
|
-
:param page: The page to extract content from.
|
|
558
|
-
:return:
|
|
559
|
-
"""
|
|
560
|
-
while True:
|
|
561
|
-
try:
|
|
562
|
-
return (await page.content()) or ""
|
|
563
|
-
except PlaywrightError:
|
|
564
|
-
await page.wait_for_timeout(1000)
|
|
565
|
-
continue
|
|
566
|
-
return "" # pyright: ignore
|
|
567
|
-
|
|
568
|
-
async def _solve_cloudflare(self, page: async_Page):
|
|
471
|
+
async def _solve_cloudflare(self, page: async_Page): # pragma: no cover
|
|
569
472
|
"""Solve the cloudflare challenge displayed on the playwright page passed. The async version
|
|
570
473
|
|
|
571
474
|
:param page: The async targeted page
|
|
572
475
|
:return:
|
|
573
476
|
"""
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
except PlaywrightError:
|
|
577
|
-
pass
|
|
578
|
-
challenge_type = self._detect_cloudflare(await self._get_page_content(page))
|
|
477
|
+
await self._wait_for_networkidle(page, timeout=5000)
|
|
478
|
+
challenge_type = self._detect_cloudflare(await ResponseFactory._get_async_page_content(page))
|
|
579
479
|
if not challenge_type:
|
|
580
480
|
log.error("No Cloudflare challenge found.")
|
|
581
481
|
return
|
|
582
482
|
else:
|
|
583
483
|
log.info(f'The turnstile version discovered is "{challenge_type}"')
|
|
584
484
|
if challenge_type == "non-interactive": # pragma: no cover
|
|
585
|
-
while "<title>Just a moment...</title>" in (await
|
|
485
|
+
while "<title>Just a moment...</title>" in (await ResponseFactory._get_async_page_content(page)):
|
|
586
486
|
log.info("Waiting for Cloudflare wait page to disappear.")
|
|
587
487
|
await page.wait_for_timeout(1000)
|
|
588
488
|
await page.wait_for_load_state()
|
|
@@ -593,15 +493,14 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
|
593
493
|
box_selector = "#cf_turnstile div, #cf-turnstile div, .turnstile>div>div"
|
|
594
494
|
if challenge_type != "embedded":
|
|
595
495
|
box_selector = ".main-content p+div>div>div"
|
|
596
|
-
while "Verifying you are human." in (await
|
|
496
|
+
while "Verifying you are human." in (await ResponseFactory._get_async_page_content(page)):
|
|
597
497
|
# Waiting for the verify spinner to disappear, checking every 1s if it disappeared
|
|
598
498
|
await page.wait_for_timeout(500)
|
|
599
499
|
|
|
600
500
|
outer_box = {}
|
|
601
501
|
iframe = page.frame(url=__CF_PATTERN__)
|
|
602
502
|
if iframe is not None:
|
|
603
|
-
await
|
|
604
|
-
await iframe.wait_for_load_state("networkidle")
|
|
503
|
+
await self._wait_for_page_stability(iframe, True, True)
|
|
605
504
|
|
|
606
505
|
if challenge_type != "embedded":
|
|
607
506
|
while not await (await iframe.frame_element()).is_visible():
|
|
@@ -617,16 +516,20 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
|
617
516
|
|
|
618
517
|
# Move the mouse to the center of the window, then press and hold the left mouse button
|
|
619
518
|
await page.mouse.click(captcha_x, captcha_y, delay=60, button="left")
|
|
620
|
-
await
|
|
519
|
+
await self._wait_for_networkidle(page)
|
|
621
520
|
if iframe is not None:
|
|
622
|
-
# Wait for the frame to be removed from the page
|
|
521
|
+
# Wait for the frame to be removed from the page (with 30s timeout = 300 iterations * 100 ms)
|
|
522
|
+
attempts = 0
|
|
623
523
|
while iframe in page.frames:
|
|
524
|
+
if attempts >= 300:
|
|
525
|
+
log.info("Cloudflare iframe didn't disappear after 30s, continuing...")
|
|
526
|
+
break
|
|
624
527
|
await page.wait_for_timeout(100)
|
|
528
|
+
attempts += 1
|
|
625
529
|
if challenge_type != "embedded":
|
|
626
530
|
await page.locator(box_selector).wait_for(state="detached")
|
|
627
531
|
await page.locator(".zone-name-title").wait_for(state="hidden")
|
|
628
|
-
await
|
|
629
|
-
await page.wait_for_load_state(state="domcontentloaded")
|
|
532
|
+
await self._wait_for_page_stability(page, True, False)
|
|
630
533
|
|
|
631
534
|
log.info("Cloudflare captcha is solved")
|
|
632
535
|
return
|
|
@@ -681,28 +584,20 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
|
681
584
|
("solve_cloudflare", solve_cloudflare, self.solve_cloudflare),
|
|
682
585
|
("selector_config", selector_config, self.selector_config),
|
|
683
586
|
],
|
|
587
|
+
CamoufoxConfig,
|
|
684
588
|
_UNSET,
|
|
685
589
|
)
|
|
686
590
|
|
|
687
591
|
if self._closed: # pragma: no cover
|
|
688
592
|
raise RuntimeError("Context manager has been closed")
|
|
689
593
|
|
|
690
|
-
final_response = None
|
|
691
594
|
referer = (
|
|
692
595
|
generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
|
|
693
596
|
)
|
|
694
597
|
|
|
695
|
-
async def handle_response(finished_response: AsyncPlaywrightResponse):
|
|
696
|
-
nonlocal final_response
|
|
697
|
-
if (
|
|
698
|
-
finished_response.request.resource_type == "document"
|
|
699
|
-
and finished_response.request.is_navigation_request()
|
|
700
|
-
and finished_response.request.frame == page_info.page.main_frame
|
|
701
|
-
):
|
|
702
|
-
final_response = finished_response
|
|
703
|
-
|
|
704
598
|
page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
|
|
705
|
-
|
|
599
|
+
final_response = [None]
|
|
600
|
+
handle_response = self._create_response_handler(page_info, final_response)
|
|
706
601
|
|
|
707
602
|
if TYPE_CHECKING:
|
|
708
603
|
if not isinstance(page_info.page, async_Page):
|
|
@@ -712,11 +607,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
|
712
607
|
# Navigate to URL and wait for a specified state
|
|
713
608
|
page_info.page.on("response", handle_response)
|
|
714
609
|
first_response = await page_info.page.goto(url, referer=referer)
|
|
715
|
-
|
|
716
|
-
await page_info.page.wait_for_load_state(state="domcontentloaded")
|
|
717
|
-
|
|
718
|
-
if params.network_idle:
|
|
719
|
-
await page_info.page.wait_for_load_state("networkidle")
|
|
610
|
+
await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
|
|
720
611
|
|
|
721
612
|
if not first_response:
|
|
722
613
|
raise RuntimeError(f"Failed to get response for {url}")
|
|
@@ -724,11 +615,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
|
724
615
|
if params.solve_cloudflare:
|
|
725
616
|
await self._solve_cloudflare(page_info.page)
|
|
726
617
|
# Make sure the page is fully loaded after the captcha
|
|
727
|
-
await page_info.page.
|
|
728
|
-
if params.load_dom:
|
|
729
|
-
await page_info.page.wait_for_load_state(state="domcontentloaded")
|
|
730
|
-
if params.network_idle:
|
|
731
|
-
await page_info.page.wait_for_load_state("networkidle")
|
|
618
|
+
await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
|
|
732
619
|
|
|
733
620
|
if params.page_action:
|
|
734
621
|
try:
|
|
@@ -741,11 +628,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
|
741
628
|
waiter: AsyncLocator = page_info.page.locator(params.wait_selector)
|
|
742
629
|
await waiter.first.wait_for(state=params.wait_selector_state)
|
|
743
630
|
# Wait again after waiting for the selector, helpful with protections like Cloudflare
|
|
744
|
-
await page_info.page.
|
|
745
|
-
if params.load_dom:
|
|
746
|
-
await page_info.page.wait_for_load_state(state="domcontentloaded")
|
|
747
|
-
if params.network_idle:
|
|
748
|
-
await page_info.page.wait_for_load_state("networkidle")
|
|
631
|
+
await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
|
|
749
632
|
except Exception as e:
|
|
750
633
|
log.error(f"Error waiting for selector {params.wait_selector}: {e}")
|
|
751
634
|
|
|
@@ -753,7 +636,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
|
753
636
|
|
|
754
637
|
# Create response object
|
|
755
638
|
response = await ResponseFactory.from_async_playwright_response(
|
|
756
|
-
page_info.page, first_response, final_response, params.selector_config
|
|
639
|
+
page_info.page, first_response, final_response[0], params.selector_config, bool(params.page_action)
|
|
757
640
|
)
|
|
758
641
|
|
|
759
642
|
# Close the page to free up resources
|
|
@@ -70,12 +70,17 @@ def _launch_kwargs(
|
|
|
70
70
|
stealth,
|
|
71
71
|
hide_canvas,
|
|
72
72
|
disable_webgl,
|
|
73
|
+
extra_flags: Tuple,
|
|
73
74
|
) -> Tuple:
|
|
74
75
|
"""Creates the arguments we will use while launching playwright's browser"""
|
|
76
|
+
base_args = DEFAULT_FLAGS
|
|
77
|
+
if extra_flags:
|
|
78
|
+
base_args = base_args + extra_flags
|
|
79
|
+
|
|
75
80
|
launch_kwargs = {
|
|
76
81
|
"locale": locale,
|
|
77
82
|
"headless": headless,
|
|
78
|
-
"args":
|
|
83
|
+
"args": base_args,
|
|
79
84
|
"color_scheme": "dark", # Bypasses the 'prefersLightColor' check in creepjs
|
|
80
85
|
"proxy": proxy or tuple(),
|
|
81
86
|
"device_scale_factor": 2,
|
|
@@ -85,9 +90,10 @@ def _launch_kwargs(
|
|
|
85
90
|
"user_agent": useragent or __default_useragent__,
|
|
86
91
|
}
|
|
87
92
|
if stealth:
|
|
93
|
+
stealth_args = base_args + _set_flags(hide_canvas, disable_webgl)
|
|
88
94
|
launch_kwargs.update(
|
|
89
95
|
{
|
|
90
|
-
"args":
|
|
96
|
+
"args": stealth_args,
|
|
91
97
|
"chromium_sandbox": True,
|
|
92
98
|
"is_mobile": False,
|
|
93
99
|
"has_touch": False,
|