scrapling-0.3.6-py3-none-any.whl → scrapling-0.3.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/core/_types.py +3 -0
- scrapling/core/ai.py +2 -1
- scrapling/core/custom_types.py +20 -27
- scrapling/core/mixins.py +15 -9
- scrapling/core/shell.py +4 -3
- scrapling/core/storage.py +5 -5
- scrapling/core/translator.py +13 -8
- scrapling/engines/_browsers/_base.py +175 -21
- scrapling/engines/_browsers/_camoufox.py +95 -171
- scrapling/engines/_browsers/_config_tools.py +9 -3
- scrapling/engines/_browsers/_controllers.py +51 -101
- scrapling/engines/_browsers/_validators.py +95 -63
- scrapling/engines/static.py +678 -668
- scrapling/engines/toolbelt/convertor.py +48 -15
- scrapling/engines/toolbelt/custom.py +6 -21
- scrapling/engines/toolbelt/fingerprints.py +14 -9
- scrapling/engines/toolbelt/navigation.py +11 -3
- scrapling/fetchers/__init__.py +11 -1
- scrapling/fetchers/chrome.py +15 -4
- scrapling/fetchers/firefox.py +0 -4
- scrapling/parser.py +105 -80
- {scrapling-0.3.6.dist-info → scrapling-0.3.8.dist-info}/METADATA +7 -6
- scrapling-0.3.8.dist-info/RECORD +47 -0
- scrapling-0.3.6.dist-info/RECORD +0 -47
- {scrapling-0.3.6.dist-info → scrapling-0.3.8.dist-info}/WHEEL +0 -0
- {scrapling-0.3.6.dist-info → scrapling-0.3.8.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.6.dist-info → scrapling-0.3.8.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.6.dist-info → scrapling-0.3.8.dist-info}/top_level.txt +0 -0
|
@@ -1,29 +1,29 @@
|
|
|
1
|
+
from random import randint
|
|
1
2
|
from re import compile as re_compile
|
|
2
3
|
|
|
3
4
|
from playwright.sync_api import (
|
|
4
|
-
Response as SyncPlaywrightResponse,
|
|
5
|
-
sync_playwright,
|
|
6
|
-
Locator,
|
|
7
5
|
Page,
|
|
6
|
+
Locator,
|
|
7
|
+
sync_playwright,
|
|
8
8
|
)
|
|
9
9
|
from playwright.async_api import (
|
|
10
10
|
async_playwright,
|
|
11
|
-
Response as AsyncPlaywrightResponse,
|
|
12
|
-
BrowserContext as AsyncBrowserContext,
|
|
13
|
-
Playwright as AsyncPlaywright,
|
|
14
|
-
Locator as AsyncLocator,
|
|
15
11
|
Page as async_Page,
|
|
12
|
+
Locator as AsyncLocator,
|
|
13
|
+
Playwright as AsyncPlaywright,
|
|
14
|
+
BrowserContext as AsyncBrowserContext,
|
|
16
15
|
)
|
|
17
|
-
from playwright._impl._errors import Error as PlaywrightError
|
|
18
16
|
|
|
19
|
-
from ._validators import validate_fetch as _validate
|
|
17
|
+
from ._validators import validate_fetch as _validate, CamoufoxConfig
|
|
20
18
|
from ._base import SyncSession, AsyncSession, StealthySessionMixin
|
|
21
19
|
from scrapling.core.utils import log
|
|
22
20
|
from scrapling.core._types import (
|
|
21
|
+
Any,
|
|
23
22
|
Dict,
|
|
24
23
|
List,
|
|
25
24
|
Optional,
|
|
26
25
|
Callable,
|
|
26
|
+
TYPE_CHECKING,
|
|
27
27
|
SelectorWaitStates,
|
|
28
28
|
)
|
|
29
29
|
from scrapling.engines.toolbelt.convertor import (
|
|
@@ -33,7 +33,7 @@ from scrapling.engines.toolbelt.convertor import (
|
|
|
33
33
|
from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
|
|
34
34
|
|
|
35
35
|
__CF_PATTERN__ = re_compile("challenges.cloudflare.com/cdn-cgi/challenge-platform/.*")
|
|
36
|
-
_UNSET = object()
|
|
36
|
+
_UNSET: Any = object()
|
|
37
37
|
|
|
38
38
|
|
|
39
39
|
class StealthySession(StealthySessionMixin, SyncSession):
|
|
@@ -101,6 +101,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
|
101
101
|
os_randomize: bool = False,
|
|
102
102
|
disable_ads: bool = False,
|
|
103
103
|
geoip: bool = False,
|
|
104
|
+
user_data_dir: str = "",
|
|
104
105
|
selector_config: Optional[Dict] = None,
|
|
105
106
|
additional_args: Optional[Dict] = None,
|
|
106
107
|
):
|
|
@@ -133,6 +134,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
|
133
134
|
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
|
134
135
|
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
|
135
136
|
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
|
137
|
+
:param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
|
136
138
|
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
|
137
139
|
:param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
|
|
138
140
|
"""
|
|
@@ -156,6 +158,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
|
156
158
|
block_images=block_images,
|
|
157
159
|
block_webrtc=block_webrtc,
|
|
158
160
|
os_randomize=os_randomize,
|
|
161
|
+
user_data_dir=user_data_dir,
|
|
159
162
|
wait_selector=wait_selector,
|
|
160
163
|
google_search=google_search,
|
|
161
164
|
extra_headers=extra_headers,
|
|
@@ -170,9 +173,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
|
170
173
|
def __create__(self):
|
|
171
174
|
"""Create a browser for this instance and context."""
|
|
172
175
|
self.playwright = sync_playwright().start()
|
|
173
|
-
self.context = self.playwright.firefox.launch_persistent_context(
|
|
174
|
-
**self.launch_options
|
|
175
|
-
)
|
|
176
|
+
self.context = self.playwright.firefox.launch_persistent_context(**self.launch_options)
|
|
176
177
|
|
|
177
178
|
if self.init_script: # pragma: no cover
|
|
178
179
|
self.context.add_init_script(path=self.init_script)
|
|
@@ -180,56 +181,21 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
|
180
181
|
if self.cookies: # pragma: no cover
|
|
181
182
|
self.context.add_cookies(self.cookies)
|
|
182
183
|
|
|
183
|
-
def __enter__(self): # pragma: no cover
|
|
184
|
-
self.__create__()
|
|
185
|
-
return self
|
|
186
|
-
|
|
187
|
-
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
188
|
-
self.close()
|
|
189
|
-
|
|
190
|
-
def close(self): # pragma: no cover
|
|
191
|
-
"""Close all resources"""
|
|
192
|
-
if self._closed: # pragma: no cover
|
|
193
|
-
return
|
|
194
|
-
|
|
195
|
-
if self.context:
|
|
196
|
-
self.context.close()
|
|
197
|
-
self.context = None
|
|
198
|
-
|
|
199
|
-
if self.playwright:
|
|
200
|
-
self.playwright.stop()
|
|
201
|
-
self.playwright = None
|
|
202
|
-
|
|
203
|
-
self._closed = True
|
|
204
|
-
|
|
205
|
-
@staticmethod
|
|
206
|
-
def _get_page_content(page: Page) -> str | None:
|
|
207
|
-
"""
|
|
208
|
-
A workaround for Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
|
|
209
|
-
:param page: The page to extract content from.
|
|
210
|
-
:return:
|
|
211
|
-
"""
|
|
212
|
-
while True:
|
|
213
|
-
try:
|
|
214
|
-
return page.content() or ""
|
|
215
|
-
except PlaywrightError:
|
|
216
|
-
page.wait_for_timeout(1000)
|
|
217
|
-
continue
|
|
218
|
-
|
|
219
184
|
def _solve_cloudflare(self, page: Page) -> None: # pragma: no cover
|
|
220
185
|
"""Solve the cloudflare challenge displayed on the playwright page passed
|
|
221
186
|
|
|
222
187
|
:param page: The targeted page
|
|
223
188
|
:return:
|
|
224
189
|
"""
|
|
225
|
-
|
|
190
|
+
self._wait_for_networkidle(page, timeout=5000)
|
|
191
|
+
challenge_type = self._detect_cloudflare(ResponseFactory._get_page_content(page))
|
|
226
192
|
if not challenge_type:
|
|
227
193
|
log.error("No Cloudflare challenge found.")
|
|
228
194
|
return
|
|
229
195
|
else:
|
|
230
196
|
log.info(f'The turnstile version discovered is "{challenge_type}"')
|
|
231
197
|
if challenge_type == "non-interactive":
|
|
232
|
-
while "<title>Just a moment...</title>" in (self._get_page_content(page)):
|
|
198
|
+
while "<title>Just a moment...</title>" in (ResponseFactory._get_page_content(page)):
|
|
233
199
|
log.info("Waiting for Cloudflare wait page to disappear.")
|
|
234
200
|
page.wait_for_timeout(1000)
|
|
235
201
|
page.wait_for_load_state()
|
|
@@ -240,31 +206,43 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
|
240
206
|
box_selector = "#cf_turnstile div, #cf-turnstile div, .turnstile>div>div"
|
|
241
207
|
if challenge_type != "embedded":
|
|
242
208
|
box_selector = ".main-content p+div>div>div"
|
|
243
|
-
while "Verifying you are human." in self._get_page_content(page):
|
|
209
|
+
while "Verifying you are human." in ResponseFactory._get_page_content(page):
|
|
244
210
|
# Wait for the "Verifying you are human." spinner to disappear, polling every 500 ms
|
|
245
211
|
page.wait_for_timeout(500)
|
|
246
212
|
|
|
213
|
+
outer_box = {}
|
|
247
214
|
iframe = page.frame(url=__CF_PATTERN__)
|
|
248
|
-
if iframe is None:
|
|
249
|
-
|
|
250
|
-
return
|
|
215
|
+
if iframe is not None:
|
|
216
|
+
self._wait_for_page_stability(iframe, True, True)
|
|
251
217
|
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
218
|
+
if challenge_type != "embedded":
|
|
219
|
+
while not iframe.frame_element().is_visible():
|
|
220
|
+
# Double-checking that the iframe is loaded
|
|
221
|
+
page.wait_for_timeout(500)
|
|
222
|
+
outer_box: Any = iframe.frame_element().bounding_box()
|
|
223
|
+
|
|
224
|
+
if not iframe or not outer_box:
|
|
225
|
+
outer_box: Any = page.locator(box_selector).last.bounding_box()
|
|
256
226
|
|
|
257
|
-
iframe.wait_for_load_state(state="domcontentloaded")
|
|
258
|
-
iframe.wait_for_load_state("networkidle")
|
|
259
227
|
# Calculate the Captcha coordinates for any viewport
|
|
260
|
-
|
|
261
|
-
captcha_x, captcha_y = outer_box["x"] + 26, outer_box["y"] + 25
|
|
228
|
+
captcha_x, captcha_y = outer_box["x"] + randint(26, 28), outer_box["y"] + randint(25, 27)
|
|
262
229
|
|
|
263
230
|
# Click inside the captcha box at the computed coordinates with the left mouse button (60 ms between press and release)
|
|
264
231
|
page.mouse.click(captcha_x, captcha_y, delay=60, button="left")
|
|
232
|
+
self._wait_for_networkidle(page)
|
|
233
|
+
if iframe is not None:
|
|
234
|
+
# Wait for the frame to be removed from the page (with 30s timeout = 300 iterations * 100 ms)
|
|
235
|
+
attempts = 0
|
|
236
|
+
while iframe in page.frames:
|
|
237
|
+
if attempts >= 300:
|
|
238
|
+
log.info("Cloudflare iframe didn't disappear after 30s, continuing...")
|
|
239
|
+
break
|
|
240
|
+
page.wait_for_timeout(100)
|
|
241
|
+
attempts += 1
|
|
265
242
|
if challenge_type != "embedded":
|
|
243
|
+
page.locator(box_selector).last.wait_for(state="detached")
|
|
266
244
|
page.locator(".zone-name-title").wait_for(state="hidden")
|
|
267
|
-
|
|
245
|
+
self._wait_for_page_stability(page, True, False)
|
|
268
246
|
|
|
269
247
|
log.info("Cloudflare captcha is solved")
|
|
270
248
|
return
|
|
@@ -319,37 +297,26 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
|
319
297
|
("solve_cloudflare", solve_cloudflare, self.solve_cloudflare),
|
|
320
298
|
("selector_config", selector_config, self.selector_config),
|
|
321
299
|
],
|
|
300
|
+
CamoufoxConfig,
|
|
322
301
|
_UNSET,
|
|
323
302
|
)
|
|
324
303
|
|
|
325
304
|
if self._closed: # pragma: no cover
|
|
326
305
|
raise RuntimeError("Context manager has been closed")
|
|
327
306
|
|
|
328
|
-
final_response = None
|
|
329
307
|
referer = (
|
|
330
308
|
generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
|
|
331
309
|
)
|
|
332
310
|
|
|
333
|
-
def handle_response(finished_response: SyncPlaywrightResponse):
|
|
334
|
-
nonlocal final_response
|
|
335
|
-
if (
|
|
336
|
-
finished_response.request.resource_type == "document"
|
|
337
|
-
and finished_response.request.is_navigation_request()
|
|
338
|
-
):
|
|
339
|
-
final_response = finished_response
|
|
340
|
-
|
|
341
311
|
page_info = self._get_page(params.timeout, params.extra_headers, params.disable_resources)
|
|
342
|
-
|
|
312
|
+
final_response = [None]
|
|
313
|
+
handle_response = self._create_response_handler(page_info, final_response)
|
|
343
314
|
|
|
344
315
|
try: # pragma: no cover
|
|
345
316
|
# Navigate to URL and wait for a specified state
|
|
346
317
|
page_info.page.on("response", handle_response)
|
|
347
318
|
first_response = page_info.page.goto(url, referer=referer)
|
|
348
|
-
|
|
349
|
-
page_info.page.wait_for_load_state(state="domcontentloaded")
|
|
350
|
-
|
|
351
|
-
if params.network_idle:
|
|
352
|
-
page_info.page.wait_for_load_state("networkidle")
|
|
319
|
+
self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
|
|
353
320
|
|
|
354
321
|
if not first_response:
|
|
355
322
|
raise RuntimeError(f"Failed to get response for {url}")
|
|
@@ -357,11 +324,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
|
357
324
|
if params.solve_cloudflare:
|
|
358
325
|
self._solve_cloudflare(page_info.page)
|
|
359
326
|
# Make sure the page is fully loaded after the captcha
|
|
360
|
-
page_info.page.
|
|
361
|
-
if params.load_dom:
|
|
362
|
-
page_info.page.wait_for_load_state(state="domcontentloaded")
|
|
363
|
-
if params.network_idle:
|
|
364
|
-
page_info.page.wait_for_load_state("networkidle")
|
|
327
|
+
self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
|
|
365
328
|
|
|
366
329
|
if params.page_action:
|
|
367
330
|
try:
|
|
@@ -374,20 +337,16 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
|
374
337
|
waiter: Locator = page_info.page.locator(params.wait_selector)
|
|
375
338
|
waiter.first.wait_for(state=params.wait_selector_state)
|
|
376
339
|
# Wait again after waiting for the selector, helpful with protections like Cloudflare
|
|
377
|
-
page_info.page.
|
|
378
|
-
if params.load_dom:
|
|
379
|
-
page_info.page.wait_for_load_state(state="domcontentloaded")
|
|
380
|
-
if params.network_idle:
|
|
381
|
-
page_info.page.wait_for_load_state("networkidle")
|
|
340
|
+
self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
|
|
382
341
|
except Exception as e:
|
|
383
342
|
log.error(f"Error waiting for selector {params.wait_selector}: {e}")
|
|
384
343
|
|
|
385
344
|
page_info.page.wait_for_timeout(params.wait)
|
|
386
345
|
response = ResponseFactory.from_playwright_response(
|
|
387
|
-
page_info.page, first_response, final_response, params.selector_config
|
|
346
|
+
page_info.page, first_response, final_response[0], params.selector_config, bool(params.page_action)
|
|
388
347
|
)
|
|
389
348
|
|
|
390
|
-
# Close the page
|
|
349
|
+
# Close the page to free up resources
|
|
391
350
|
page_info.page.close()
|
|
392
351
|
self.page_pool.pages.remove(page_info)
|
|
393
352
|
|
|
@@ -427,6 +386,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
|
427
386
|
os_randomize: bool = False,
|
|
428
387
|
disable_ads: bool = False,
|
|
429
388
|
geoip: bool = False,
|
|
389
|
+
user_data_dir: str = "",
|
|
430
390
|
selector_config: Optional[Dict] = None,
|
|
431
391
|
additional_args: Optional[Dict] = None,
|
|
432
392
|
):
|
|
@@ -460,6 +420,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
|
460
420
|
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
|
461
421
|
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
|
462
422
|
:param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
|
|
423
|
+
:param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
|
463
424
|
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
|
464
425
|
:param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
|
|
465
426
|
"""
|
|
@@ -485,6 +446,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
|
485
446
|
wait_selector=wait_selector,
|
|
486
447
|
google_search=google_search,
|
|
487
448
|
extra_headers=extra_headers,
|
|
449
|
+
user_data_dir=user_data_dir,
|
|
488
450
|
additional_args=additional_args,
|
|
489
451
|
selector_config=selector_config,
|
|
490
452
|
solve_cloudflare=solve_cloudflare,
|
|
@@ -504,58 +466,23 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
|
504
466
|
await self.context.add_init_script(path=self.init_script)
|
|
505
467
|
|
|
506
468
|
if self.cookies:
|
|
507
|
-
await self.context.add_cookies(self.cookies)
|
|
508
|
-
|
|
509
|
-
async def __aenter__(self):
|
|
510
|
-
await self.__create__()
|
|
511
|
-
return self
|
|
512
|
-
|
|
513
|
-
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
|
514
|
-
await self.close()
|
|
515
|
-
|
|
516
|
-
async def close(self):
|
|
517
|
-
"""Close all resources"""
|
|
518
|
-
if self._closed: # pragma: no cover
|
|
519
|
-
return
|
|
520
|
-
|
|
521
|
-
if self.context:
|
|
522
|
-
await self.context.close()
|
|
523
|
-
self.context = None
|
|
469
|
+
await self.context.add_cookies(self.cookies) # pyright: ignore [reportArgumentType]
|
|
524
470
|
|
|
525
|
-
|
|
526
|
-
await self.playwright.stop()
|
|
527
|
-
self.playwright = None
|
|
528
|
-
|
|
529
|
-
self._closed = True
|
|
530
|
-
|
|
531
|
-
@staticmethod
|
|
532
|
-
async def _get_page_content(page: async_Page) -> str | None:
|
|
533
|
-
"""
|
|
534
|
-
A workaround for Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
|
|
535
|
-
:param page: The page to extract content from.
|
|
536
|
-
:return:
|
|
537
|
-
"""
|
|
538
|
-
while True:
|
|
539
|
-
try:
|
|
540
|
-
return (await page.content()) or ""
|
|
541
|
-
except PlaywrightError:
|
|
542
|
-
await page.wait_for_timeout(1000)
|
|
543
|
-
continue
|
|
544
|
-
|
|
545
|
-
async def _solve_cloudflare(self, page: async_Page):
|
|
471
|
+
async def _solve_cloudflare(self, page: async_Page): # pragma: no cover
|
|
546
472
|
"""Solve the cloudflare challenge displayed on the playwright page passed. The async version
|
|
547
473
|
|
|
548
474
|
:param page: The async targeted page
|
|
549
475
|
:return:
|
|
550
476
|
"""
|
|
551
|
-
|
|
477
|
+
await self._wait_for_networkidle(page, timeout=5000)
|
|
478
|
+
challenge_type = self._detect_cloudflare(await ResponseFactory._get_async_page_content(page))
|
|
552
479
|
if not challenge_type:
|
|
553
480
|
log.error("No Cloudflare challenge found.")
|
|
554
481
|
return
|
|
555
482
|
else:
|
|
556
483
|
log.info(f'The turnstile version discovered is "{challenge_type}"')
|
|
557
484
|
if challenge_type == "non-interactive": # pragma: no cover
|
|
558
|
-
while "<title>Just a moment...</title>" in (await self._get_page_content(page)):
|
|
485
|
+
while "<title>Just a moment...</title>" in (await ResponseFactory._get_async_page_content(page)):
|
|
559
486
|
log.info("Waiting for Cloudflare wait page to disappear.")
|
|
560
487
|
await page.wait_for_timeout(1000)
|
|
561
488
|
await page.wait_for_load_state()
|
|
@@ -566,31 +493,43 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
|
566
493
|
box_selector = "#cf_turnstile div, #cf-turnstile div, .turnstile>div>div"
|
|
567
494
|
if challenge_type != "embedded":
|
|
568
495
|
box_selector = ".main-content p+div>div>div"
|
|
569
|
-
while "Verifying you are human." in (await self._get_page_content(page)):
|
|
496
|
+
while "Verifying you are human." in (await ResponseFactory._get_async_page_content(page)):
|
|
570
497
|
# Wait for the "Verifying you are human." spinner to disappear, polling every 500 ms
|
|
571
498
|
await page.wait_for_timeout(500)
|
|
572
499
|
|
|
500
|
+
outer_box = {}
|
|
573
501
|
iframe = page.frame(url=__CF_PATTERN__)
|
|
574
|
-
if iframe is None:
|
|
575
|
-
|
|
576
|
-
return
|
|
502
|
+
if iframe is not None:
|
|
503
|
+
await self._wait_for_page_stability(iframe, True, True)
|
|
577
504
|
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
505
|
+
if challenge_type != "embedded":
|
|
506
|
+
while not await (await iframe.frame_element()).is_visible():
|
|
507
|
+
# Double-checking that the iframe is loaded
|
|
508
|
+
await page.wait_for_timeout(500)
|
|
509
|
+
outer_box: Any = await (await iframe.frame_element()).bounding_box()
|
|
510
|
+
|
|
511
|
+
if not iframe or not outer_box:
|
|
512
|
+
outer_box: Any = await page.locator(box_selector).last.bounding_box()
|
|
582
513
|
|
|
583
|
-
await iframe.wait_for_load_state(state="domcontentloaded")
|
|
584
|
-
await iframe.wait_for_load_state("networkidle")
|
|
585
514
|
# Calculate the Captcha coordinates for any viewport
|
|
586
|
-
|
|
587
|
-
captcha_x, captcha_y = outer_box["x"] + 26, outer_box["y"] + 25
|
|
515
|
+
captcha_x, captcha_y = outer_box["x"] + randint(26, 28), outer_box["y"] + randint(25, 27)
|
|
588
516
|
|
|
589
517
|
# Click inside the captcha box at the computed coordinates with the left mouse button (60 ms between press and release)
|
|
590
518
|
await page.mouse.click(captcha_x, captcha_y, delay=60, button="left")
|
|
519
|
+
await self._wait_for_networkidle(page)
|
|
520
|
+
if iframe is not None:
|
|
521
|
+
# Wait for the frame to be removed from the page (with 30s timeout = 300 iterations * 100 ms)
|
|
522
|
+
attempts = 0
|
|
523
|
+
while iframe in page.frames:
|
|
524
|
+
if attempts >= 300:
|
|
525
|
+
log.info("Cloudflare iframe didn't disappear after 30s, continuing...")
|
|
526
|
+
break
|
|
527
|
+
await page.wait_for_timeout(100)
|
|
528
|
+
attempts += 1
|
|
591
529
|
if challenge_type != "embedded":
|
|
530
|
+
await page.locator(box_selector).wait_for(state="detached")
|
|
592
531
|
await page.locator(".zone-name-title").wait_for(state="hidden")
|
|
593
|
-
await
|
|
532
|
+
await self._wait_for_page_stability(page, True, False)
|
|
594
533
|
|
|
595
534
|
log.info("Cloudflare captcha is solved")
|
|
596
535
|
return
|
|
@@ -645,37 +584,30 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
|
645
584
|
("solve_cloudflare", solve_cloudflare, self.solve_cloudflare),
|
|
646
585
|
("selector_config", selector_config, self.selector_config),
|
|
647
586
|
],
|
|
587
|
+
CamoufoxConfig,
|
|
648
588
|
_UNSET,
|
|
649
589
|
)
|
|
650
590
|
|
|
651
591
|
if self._closed: # pragma: no cover
|
|
652
592
|
raise RuntimeError("Context manager has been closed")
|
|
653
593
|
|
|
654
|
-
final_response = None
|
|
655
594
|
referer = (
|
|
656
595
|
generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
|
|
657
596
|
)
|
|
658
597
|
|
|
659
|
-
async def handle_response(finished_response: AsyncPlaywrightResponse):
|
|
660
|
-
nonlocal final_response
|
|
661
|
-
if (
|
|
662
|
-
finished_response.request.resource_type == "document"
|
|
663
|
-
and finished_response.request.is_navigation_request()
|
|
664
|
-
):
|
|
665
|
-
final_response = finished_response
|
|
666
|
-
|
|
667
598
|
page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
|
|
668
|
-
|
|
599
|
+
final_response = [None]
|
|
600
|
+
handle_response = self._create_response_handler(page_info, final_response)
|
|
601
|
+
|
|
602
|
+
if TYPE_CHECKING:
|
|
603
|
+
if not isinstance(page_info.page, async_Page):
|
|
604
|
+
raise TypeError
|
|
669
605
|
|
|
670
606
|
try:
|
|
671
607
|
# Navigate to URL and wait for a specified state
|
|
672
608
|
page_info.page.on("response", handle_response)
|
|
673
609
|
first_response = await page_info.page.goto(url, referer=referer)
|
|
674
|
-
|
|
675
|
-
await page_info.page.wait_for_load_state(state="domcontentloaded")
|
|
676
|
-
|
|
677
|
-
if params.network_idle:
|
|
678
|
-
await page_info.page.wait_for_load_state("networkidle")
|
|
610
|
+
await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
|
|
679
611
|
|
|
680
612
|
if not first_response:
|
|
681
613
|
raise RuntimeError(f"Failed to get response for {url}")
|
|
@@ -683,11 +615,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
|
683
615
|
if params.solve_cloudflare:
|
|
684
616
|
await self._solve_cloudflare(page_info.page)
|
|
685
617
|
# Make sure the page is fully loaded after the captcha
|
|
686
|
-
await page_info.page.
|
|
687
|
-
if params.load_dom:
|
|
688
|
-
await page_info.page.wait_for_load_state(state="domcontentloaded")
|
|
689
|
-
if params.network_idle:
|
|
690
|
-
await page_info.page.wait_for_load_state("networkidle")
|
|
618
|
+
await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
|
|
691
619
|
|
|
692
620
|
if params.page_action:
|
|
693
621
|
try:
|
|
@@ -700,11 +628,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
|
700
628
|
waiter: AsyncLocator = page_info.page.locator(params.wait_selector)
|
|
701
629
|
await waiter.first.wait_for(state=params.wait_selector_state)
|
|
702
630
|
# Wait again after waiting for the selector, helpful with protections like Cloudflare
|
|
703
|
-
await page_info.page.
|
|
704
|
-
if params.load_dom:
|
|
705
|
-
await page_info.page.wait_for_load_state(state="domcontentloaded")
|
|
706
|
-
if params.network_idle:
|
|
707
|
-
await page_info.page.wait_for_load_state("networkidle")
|
|
631
|
+
await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
|
|
708
632
|
except Exception as e:
|
|
709
633
|
log.error(f"Error waiting for selector {params.wait_selector}: {e}")
|
|
710
634
|
|
|
@@ -712,10 +636,10 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
|
712
636
|
|
|
713
637
|
# Create response object
|
|
714
638
|
response = await ResponseFactory.from_async_playwright_response(
|
|
715
|
-
page_info.page, first_response, final_response, params.selector_config
|
|
639
|
+
page_info.page, first_response, final_response[0], params.selector_config, bool(params.page_action)
|
|
716
640
|
)
|
|
717
641
|
|
|
718
|
-
# Close the page
|
|
642
|
+
# Close the page to free up resources
|
|
719
643
|
await page_info.page.close()
|
|
720
644
|
self.page_pool.pages.remove(page_info)
|
|
721
645
|
|
|
@@ -62,7 +62,7 @@ def _set_flags(hide_canvas, disable_webgl): # pragma: no cover
|
|
|
62
62
|
@lru_cache(2, typed=True)
|
|
63
63
|
def _launch_kwargs(
|
|
64
64
|
headless,
|
|
65
|
-
proxy,
|
|
65
|
+
proxy: Tuple,
|
|
66
66
|
locale,
|
|
67
67
|
extra_headers,
|
|
68
68
|
useragent,
|
|
@@ -70,12 +70,17 @@ def _launch_kwargs(
|
|
|
70
70
|
stealth,
|
|
71
71
|
hide_canvas,
|
|
72
72
|
disable_webgl,
|
|
73
|
+
extra_flags: Tuple,
|
|
73
74
|
) -> Tuple:
|
|
74
75
|
"""Creates the arguments we will use while launching playwright's browser"""
|
|
76
|
+
base_args = DEFAULT_FLAGS
|
|
77
|
+
if extra_flags:
|
|
78
|
+
base_args = base_args + extra_flags
|
|
79
|
+
|
|
75
80
|
launch_kwargs = {
|
|
76
81
|
"locale": locale,
|
|
77
82
|
"headless": headless,
|
|
78
|
-
"args":
|
|
83
|
+
"args": base_args,
|
|
79
84
|
"color_scheme": "dark", # Bypasses the 'prefersLightColor' check in creepjs
|
|
80
85
|
"proxy": proxy or tuple(),
|
|
81
86
|
"device_scale_factor": 2,
|
|
@@ -85,9 +90,10 @@ def _launch_kwargs(
|
|
|
85
90
|
"user_agent": useragent or __default_useragent__,
|
|
86
91
|
}
|
|
87
92
|
if stealth:
|
|
93
|
+
stealth_args = base_args + _set_flags(hide_canvas, disable_webgl)
|
|
88
94
|
launch_kwargs.update(
|
|
89
95
|
{
|
|
90
|
-
"args":
|
|
96
|
+
"args": stealth_args,
|
|
91
97
|
"chromium_sandbox": True,
|
|
92
98
|
"is_mobile": False,
|
|
93
99
|
"has_touch": False,
|