scrapling 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. scrapling/__init__.py +29 -19
  2. scrapling/cli.py +21 -4
  3. scrapling/core/_types.py +3 -2
  4. scrapling/core/ai.py +24 -15
  5. scrapling/core/custom_types.py +20 -27
  6. scrapling/core/mixins.py +15 -9
  7. scrapling/core/shell.py +6 -4
  8. scrapling/core/storage.py +7 -6
  9. scrapling/core/translator.py +13 -8
  10. scrapling/core/utils/__init__.py +0 -1
  11. scrapling/engines/_browsers/__init__.py +0 -2
  12. scrapling/engines/_browsers/_base.py +45 -21
  13. scrapling/engines/_browsers/_camoufox.py +98 -43
  14. scrapling/engines/_browsers/_config_tools.py +1 -1
  15. scrapling/engines/_browsers/_controllers.py +34 -13
  16. scrapling/engines/_browsers/_validators.py +31 -10
  17. scrapling/engines/constants.py +0 -15
  18. scrapling/engines/static.py +749 -336
  19. scrapling/engines/toolbelt/convertor.py +13 -15
  20. scrapling/engines/toolbelt/custom.py +6 -9
  21. scrapling/engines/toolbelt/fingerprints.py +17 -10
  22. scrapling/engines/toolbelt/navigation.py +11 -3
  23. scrapling/fetchers/__init__.py +46 -0
  24. scrapling/fetchers/chrome.py +210 -0
  25. scrapling/fetchers/firefox.py +212 -0
  26. scrapling/fetchers/requests.py +28 -0
  27. scrapling/parser.py +109 -84
  28. {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/METADATA +17 -16
  29. scrapling-0.3.7.dist-info/RECORD +47 -0
  30. scrapling/fetchers.py +0 -444
  31. scrapling-0.3.5.dist-info/RECORD +0 -44
  32. {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/WHEEL +0 -0
  33. {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/entry_points.txt +0 -0
  34. {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/licenses/LICENSE +0 -0
  35. {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/top_level.txt +0 -0
@@ -7,22 +7,16 @@ from playwright.async_api import (
7
7
  BrowserContext as AsyncBrowserContext,
8
8
  Playwright as AsyncPlaywright,
9
9
  )
10
- from camoufox.utils import (
11
- launch_options as generate_launch_options,
12
- installed_verstr as camoufox_version,
13
- )
10
+ from camoufox.pkgman import installed_verstr as camoufox_version
11
+ from camoufox.utils import launch_options as generate_launch_options
14
12
 
15
- from scrapling.engines.toolbelt.navigation import intercept_route, async_intercept_route
16
- from scrapling.core._types import (
17
- Any,
18
- Dict,
19
- Optional,
20
- )
21
13
  from ._page import PageInfo, PagePool
22
- from ._config_tools import _compiled_stealth_scripts
23
- from ._config_tools import _launch_kwargs, _context_kwargs
14
+ from scrapling.parser import Selector
15
+ from scrapling.core._types import Any, cast, Dict, Optional, TYPE_CHECKING
24
16
  from scrapling.engines.toolbelt.fingerprints import get_os_name
25
17
  from ._validators import validate, PlaywrightConfig, CamoufoxConfig
18
+ from ._config_tools import _compiled_stealth_scripts, _launch_kwargs, _context_kwargs
19
+ from scrapling.engines.toolbelt.navigation import intercept_route, async_intercept_route
26
20
 
27
21
  __ff_version_str__ = camoufox_version().split(".", 1)[0]
28
22
 
@@ -45,6 +39,7 @@ class SyncSession:
45
39
  """Get a new page to use"""
46
40
 
47
41
  # No need to check if a page is available or not in sync code because the code blocked before reaching here till the page closed, ofc.
42
+ assert self.context is not None, "Browser context not initialized"
48
43
  page = self.context.new_page()
49
44
  page.set_default_navigation_timeout(timeout)
50
45
  page.set_default_timeout(timeout)
@@ -69,11 +64,14 @@ class SyncSession:
69
64
  }
70
65
 
71
66
 
72
- class AsyncSession(SyncSession):
67
+ class AsyncSession:
73
68
  def __init__(self, max_pages: int = 1):
74
- super().__init__(max_pages)
69
+ self.max_pages = max_pages
70
+ self.page_pool = PagePool(max_pages)
71
+ self._max_wait_for_page = 60
75
72
  self.playwright: Optional[AsyncPlaywright] = None
76
73
  self.context: Optional[AsyncBrowserContext] = None
74
+ self._closed = False
77
75
  self._lock = Lock()
78
76
 
79
77
  async def _get_page(
@@ -83,6 +81,9 @@ class AsyncSession(SyncSession):
83
81
  disable_resources: bool,
84
82
  ) -> PageInfo: # pragma: no cover
85
83
  """Get a new page to use"""
84
+ if TYPE_CHECKING:
85
+ assert self.context is not None, "Browser context not initialized"
86
+
86
87
  async with self._lock:
87
88
  # If we're at max capacity after cleanup, wait for busy pages to finish
88
89
  if self.page_pool.pages_count >= self.max_pages:
@@ -96,6 +97,7 @@ class AsyncSession(SyncSession):
96
97
  f"No pages finished to clear place in the pool within the {self._max_wait_for_page}s timeout period"
97
98
  )
98
99
 
100
+ assert self.context is not None, "Browser context not initialized"
99
101
  page = await self.context.new_page()
100
102
  page.set_default_navigation_timeout(timeout)
101
103
  page.set_default_timeout(timeout)
@@ -111,6 +113,14 @@ class AsyncSession(SyncSession):
111
113
 
112
114
  return self.page_pool.add_page(page)
113
115
 
116
+ def get_pool_stats(self) -> Dict[str, int]:
117
+ """Get statistics about the current page pool"""
118
+ return {
119
+ "total_pages": self.page_pool.pages_count,
120
+ "busy_pages": self.page_pool.busy_count,
121
+ "max_pages": self.max_pages,
122
+ }
123
+
114
124
 
115
125
  class DynamicSessionMixin:
116
126
  def __validate__(self, **params):
@@ -138,11 +148,16 @@ class DynamicSessionMixin:
138
148
  self.init_script = config.init_script
139
149
  self.wait_selector_state = config.wait_selector_state
140
150
  self.selector_config = config.selector_config
151
+ self.additional_args = config.additional_args
141
152
  self.page_action = config.page_action
142
- self._headers_keys = set(map(str.lower, self.extra_headers.keys())) if self.extra_headers else set()
153
+ self.user_data_dir = config.user_data_dir
154
+ self._headers_keys = {header.lower() for header in self.extra_headers.keys()} if self.extra_headers else set()
143
155
  self.__initiate_browser_options__()
144
156
 
145
157
  def __initiate_browser_options__(self):
158
+ if TYPE_CHECKING:
159
+ assert isinstance(self.proxy, tuple)
160
+
146
161
  if not self.cdp_url:
147
162
  # `launch_options` is used with persistent context
148
163
  self.launch_options = dict(
@@ -160,6 +175,8 @@ class DynamicSessionMixin:
160
175
  )
161
176
  self.launch_options["extra_http_headers"] = dict(self.launch_options["extra_http_headers"])
162
177
  self.launch_options["proxy"] = dict(self.launch_options["proxy"]) or None
178
+ self.launch_options["user_data_dir"] = self.user_data_dir
179
+ self.launch_options.update(cast(Dict, self.additional_args))
163
180
  self.context_options = dict()
164
181
  else:
165
182
  # while `context_options` is left to be used when cdp mode is enabled
@@ -175,11 +192,12 @@ class DynamicSessionMixin:
175
192
  )
176
193
  self.context_options["extra_http_headers"] = dict(self.context_options["extra_http_headers"])
177
194
  self.context_options["proxy"] = dict(self.context_options["proxy"]) or None
195
+ self.context_options.update(cast(Dict, self.additional_args))
178
196
 
179
197
 
180
198
  class StealthySessionMixin:
181
199
  def __validate__(self, **params):
182
- config = validate(params, model=CamoufoxConfig)
200
+ config: CamoufoxConfig = validate(params, model=CamoufoxConfig)
183
201
 
184
202
  self.max_pages = config.max_pages
185
203
  self.headless = config.headless
@@ -208,15 +226,16 @@ class StealthySessionMixin:
208
226
  self.selector_config = config.selector_config
209
227
  self.additional_args = config.additional_args
210
228
  self.page_action = config.page_action
211
- self._headers_keys = set(map(str.lower, self.extra_headers.keys())) if self.extra_headers else set()
229
+ self.user_data_dir = config.user_data_dir
230
+ self._headers_keys = {header.lower() for header in self.extra_headers.keys()} if self.extra_headers else set()
212
231
  self.__initiate_browser_options__()
213
232
 
214
233
  def __initiate_browser_options__(self):
215
234
  """Initiate browser options."""
216
- self.launch_options = generate_launch_options(
235
+ self.launch_options: Dict[str, Any] = generate_launch_options(
217
236
  **{
218
237
  "geoip": self.geoip,
219
- "proxy": dict(self.proxy) if self.proxy else self.proxy,
238
+ "proxy": dict(self.proxy) if self.proxy and isinstance(self.proxy, tuple) else self.proxy,
220
239
  "addons": self.addons,
221
240
  "exclude_addons": [] if self.disable_ads else [DefaultAddons.UBO],
222
241
  "headless": self.headless,
@@ -226,7 +245,7 @@ class StealthySessionMixin:
226
245
  "block_webrtc": self.block_webrtc,
227
246
  "block_images": self.block_images, # Careful! it makes some websites don't finish loading at all like stackoverflow even in headful mode.
228
247
  "os": None if self.os_randomize else get_os_name(),
229
- "user_data_dir": "",
248
+ "user_data_dir": self.user_data_dir,
230
249
  "ff_version": __ff_version_str__,
231
250
  "firefox_user_prefs": {
232
251
  # This is what enabling `enable_cache` does internally, so we do it from here instead
@@ -236,7 +255,7 @@ class StealthySessionMixin:
236
255
  "browser.cache.disk_cache_ssl": True,
237
256
  "browser.cache.disk.smart_size.enabled": True,
238
257
  },
239
- **self.additional_args,
258
+ **cast(Dict, self.additional_args),
240
259
  }
241
260
  )
242
261
 
@@ -268,4 +287,9 @@ class StealthySessionMixin:
268
287
  if f"cType: '{ctype}'" in page_content:
269
288
  return ctype
270
289
 
290
+ # Check if turnstile captcha is embedded inside the page (Usually inside a closed Shadow iframe)
291
+ selector = Selector(content=page_content)
292
+ if selector.css('script[src*="challenges.cloudflare.com/turnstile/v"]'):
293
+ return "embedded"
294
+
271
295
  return None
@@ -1,3 +1,4 @@
1
+ from random import randint
1
2
  from re import compile as re_compile
2
3
 
3
4
  from playwright.sync_api import (
@@ -20,10 +21,12 @@ from ._validators import validate_fetch as _validate
20
21
  from ._base import SyncSession, AsyncSession, StealthySessionMixin
21
22
  from scrapling.core.utils import log
22
23
  from scrapling.core._types import (
24
+ Any,
23
25
  Dict,
24
26
  List,
25
27
  Optional,
26
28
  Callable,
29
+ TYPE_CHECKING,
27
30
  SelectorWaitStates,
28
31
  )
29
32
  from scrapling.engines.toolbelt.convertor import (
@@ -33,7 +36,7 @@ from scrapling.engines.toolbelt.convertor import (
33
36
  from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
34
37
 
35
38
  __CF_PATTERN__ = re_compile("challenges.cloudflare.com/cdn-cgi/challenge-platform/.*")
36
- _UNSET = object()
39
+ _UNSET: Any = object()
37
40
 
38
41
 
39
42
  class StealthySession(StealthySessionMixin, SyncSession):
@@ -101,6 +104,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
101
104
  os_randomize: bool = False,
102
105
  disable_ads: bool = False,
103
106
  geoip: bool = False,
107
+ user_data_dir: str = "",
104
108
  selector_config: Optional[Dict] = None,
105
109
  additional_args: Optional[Dict] = None,
106
110
  ):
@@ -116,7 +120,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
116
120
  :param cookies: Set cookies for the next request.
117
121
  :param addons: List of Firefox addons to use. Must be paths to extracted addons.
118
122
  :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
119
- :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
123
+ :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
120
124
  :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
121
125
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
122
126
  :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
@@ -133,6 +137,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
133
137
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
134
138
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
135
139
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
140
+ :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
136
141
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
137
142
  :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
138
143
  """
@@ -156,6 +161,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
156
161
  block_images=block_images,
157
162
  block_webrtc=block_webrtc,
158
163
  os_randomize=os_randomize,
164
+ user_data_dir=user_data_dir,
159
165
  wait_selector=wait_selector,
160
166
  google_search=google_search,
161
167
  extra_headers=extra_headers,
@@ -170,9 +176,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
170
176
  def __create__(self):
171
177
  """Create a browser for this instance and context."""
172
178
  self.playwright = sync_playwright().start()
173
- self.context = self.playwright.firefox.launch_persistent_context( # pragma: no cover
174
- **self.launch_options
175
- )
179
+ self.context = self.playwright.firefox.launch_persistent_context(**self.launch_options)
176
180
 
177
181
  if self.init_script: # pragma: no cover
178
182
  self.context.add_init_script(path=self.init_script)
@@ -203,9 +207,9 @@ class StealthySession(StealthySessionMixin, SyncSession):
203
207
  self._closed = True
204
208
 
205
209
  @staticmethod
206
- def _get_page_content(page: Page) -> str | None:
210
+ def _get_page_content(page: Page) -> str:
207
211
  """
208
- A workaround for Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
212
+ A workaround for the Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
209
213
  :param page: The page to extract content from.
210
214
  :return:
211
215
  """
@@ -215,6 +219,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
215
219
  except PlaywrightError:
216
220
  page.wait_for_timeout(1000)
217
221
  continue
222
+ return "" # pyright: ignore
218
223
 
219
224
  def _solve_cloudflare(self, page: Page) -> None: # pragma: no cover
220
225
  """Solve the cloudflare challenge displayed on the playwright page passed
@@ -222,6 +227,10 @@ class StealthySession(StealthySessionMixin, SyncSession):
222
227
  :param page: The targeted page
223
228
  :return:
224
229
  """
230
+ try:
231
+ page.wait_for_load_state("networkidle", timeout=5000)
232
+ except PlaywrightError:
233
+ pass
225
234
  challenge_type = self._detect_cloudflare(self._get_page_content(page))
226
235
  if not challenge_type:
227
236
  log.error("No Cloudflare challenge found.")
@@ -237,26 +246,42 @@ class StealthySession(StealthySessionMixin, SyncSession):
237
246
  return
238
247
 
239
248
  else:
240
- while "Verifying you are human." in self._get_page_content(page):
241
- # Waiting for the verify spinner to disappear, checking every 1s if it disappeared
242
- page.wait_for_timeout(500)
243
-
249
+ box_selector = "#cf_turnstile div, #cf-turnstile div, .turnstile>div>div"
250
+ if challenge_type != "embedded":
251
+ box_selector = ".main-content p+div>div>div"
252
+ while "Verifying you are human." in self._get_page_content(page):
253
+ # Waiting for the verify spinner to disappear, checking every 1s if it disappeared
254
+ page.wait_for_timeout(500)
255
+
256
+ outer_box = {}
244
257
  iframe = page.frame(url=__CF_PATTERN__)
245
- if iframe is None:
246
- log.info("Didn't find Cloudflare iframe!")
247
- return
258
+ if iframe is not None:
259
+ iframe.wait_for_load_state(state="domcontentloaded")
260
+ iframe.wait_for_load_state("networkidle")
261
+
262
+ if challenge_type != "embedded":
263
+ while not iframe.frame_element().is_visible():
264
+ # Double-checking that the iframe is loaded
265
+ page.wait_for_timeout(500)
266
+ outer_box: Any = iframe.frame_element().bounding_box()
248
267
 
249
- while not iframe.frame_element().is_visible():
250
- # Double-checking that the iframe is loaded
251
- page.wait_for_timeout(500)
268
+ if not iframe or not outer_box:
269
+ outer_box: Any = page.locator(box_selector).last.bounding_box()
252
270
 
253
271
  # Calculate the Captcha coordinates for any viewport
254
- outer_box = page.locator(".main-content p+div>div>div").bounding_box()
255
- captcha_x, captcha_y = outer_box["x"] + 26, outer_box["y"] + 25
272
+ captcha_x, captcha_y = outer_box["x"] + randint(26, 28), outer_box["y"] + randint(25, 27)
256
273
 
257
274
  # Move the mouse to the center of the window, then press and hold the left mouse button
258
275
  page.mouse.click(captcha_x, captcha_y, delay=60, button="left")
259
- page.locator(".zone-name-title").wait_for(state="hidden")
276
+ page.wait_for_load_state("networkidle")
277
+ if iframe is not None:
278
+ # Wait for the frame to be removed from the page
279
+ while iframe in page.frames:
280
+ page.wait_for_timeout(100)
281
+ if challenge_type != "embedded":
282
+ page.locator(box_selector).last.wait_for(state="detached")
283
+ page.locator(".zone-name-title").wait_for(state="hidden")
284
+ page.wait_for_load_state(state="load")
260
285
  page.wait_for_load_state(state="domcontentloaded")
261
286
 
262
287
  log.info("Cloudflare captcha is solved")
@@ -293,7 +318,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
293
318
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
294
319
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
295
320
  :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
296
- :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
321
+ :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
297
322
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
298
323
  :return: A `Response` object.
299
324
  """
@@ -328,6 +353,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
328
353
  if (
329
354
  finished_response.request.resource_type == "document"
330
355
  and finished_response.request.is_navigation_request()
356
+ and finished_response.request.frame == page_info.page.main_frame
331
357
  ):
332
358
  final_response = finished_response
333
359
 
@@ -380,7 +406,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
380
406
  page_info.page, first_response, final_response, params.selector_config
381
407
  )
382
408
 
383
- # Close the page, to free up resources
409
+ # Close the page to free up resources
384
410
  page_info.page.close()
385
411
  self.page_pool.pages.remove(page_info)
386
412
 
@@ -420,6 +446,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
420
446
  os_randomize: bool = False,
421
447
  disable_ads: bool = False,
422
448
  geoip: bool = False,
449
+ user_data_dir: str = "",
423
450
  selector_config: Optional[Dict] = None,
424
451
  additional_args: Optional[Dict] = None,
425
452
  ):
@@ -435,7 +462,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
435
462
  :param cookies: Set cookies for the next request.
436
463
  :param addons: List of Firefox addons to use. Must be paths to extracted addons.
437
464
  :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
438
- :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
465
+ :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
439
466
  :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
440
467
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
441
468
  :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
@@ -453,6 +480,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
453
480
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
454
481
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
455
482
  :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
483
+ :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
456
484
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
457
485
  :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
458
486
  """
@@ -478,6 +506,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
478
506
  wait_selector=wait_selector,
479
507
  google_search=google_search,
480
508
  extra_headers=extra_headers,
509
+ user_data_dir=user_data_dir,
481
510
  additional_args=additional_args,
482
511
  selector_config=selector_config,
483
512
  solve_cloudflare=solve_cloudflare,
@@ -497,7 +526,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
497
526
  await self.context.add_init_script(path=self.init_script)
498
527
 
499
528
  if self.cookies:
500
- await self.context.add_cookies(self.cookies)
529
+ await self.context.add_cookies(self.cookies) # pyright: ignore [reportArgumentType]
501
530
 
502
531
  async def __aenter__(self):
503
532
  await self.__create__()
@@ -513,18 +542,18 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
513
542
 
514
543
  if self.context:
515
544
  await self.context.close()
516
- self.context = None
545
+ self.context = None # pyright: ignore
517
546
 
518
547
  if self.playwright:
519
548
  await self.playwright.stop()
520
- self.playwright = None
549
+ self.playwright = None # pyright: ignore
521
550
 
522
551
  self._closed = True
523
552
 
524
553
  @staticmethod
525
- async def _get_page_content(page: async_Page) -> str | None:
554
+ async def _get_page_content(page: async_Page) -> str:
526
555
  """
527
- A workaround for Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
556
+ A workaround for the Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
528
557
  :param page: The page to extract content from.
529
558
  :return:
530
559
  """
@@ -534,6 +563,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
534
563
  except PlaywrightError:
535
564
  await page.wait_for_timeout(1000)
536
565
  continue
566
+ return "" # pyright: ignore
537
567
 
538
568
  async def _solve_cloudflare(self, page: async_Page):
539
569
  """Solve the cloudflare challenge displayed on the playwright page passed. The async version
@@ -541,6 +571,10 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
541
571
  :param page: The async targeted page
542
572
  :return:
543
573
  """
574
+ try:
575
+ await page.wait_for_load_state("networkidle", timeout=5000)
576
+ except PlaywrightError:
577
+ pass
544
578
  challenge_type = self._detect_cloudflare(await self._get_page_content(page))
545
579
  if not challenge_type:
546
580
  log.error("No Cloudflare challenge found.")
@@ -556,26 +590,42 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
556
590
  return
557
591
 
558
592
  else:
559
- while "Verifying you are human." in (await self._get_page_content(page)):
560
- # Waiting for the verify spinner to disappear, checking every 1s if it disappeared
561
- await page.wait_for_timeout(500)
562
-
593
+ box_selector = "#cf_turnstile div, #cf-turnstile div, .turnstile>div>div"
594
+ if challenge_type != "embedded":
595
+ box_selector = ".main-content p+div>div>div"
596
+ while "Verifying you are human." in (await self._get_page_content(page)):
597
+ # Waiting for the verify spinner to disappear, checking every 1s if it disappeared
598
+ await page.wait_for_timeout(500)
599
+
600
+ outer_box = {}
563
601
  iframe = page.frame(url=__CF_PATTERN__)
564
- if iframe is None:
565
- log.info("Didn't find Cloudflare iframe!")
566
- return
602
+ if iframe is not None:
603
+ await iframe.wait_for_load_state(state="domcontentloaded")
604
+ await iframe.wait_for_load_state("networkidle")
567
605
 
568
- while not await (await iframe.frame_element()).is_visible():
569
- # Double-checking that the iframe is loaded
570
- await page.wait_for_timeout(500)
606
+ if challenge_type != "embedded":
607
+ while not await (await iframe.frame_element()).is_visible():
608
+ # Double-checking that the iframe is loaded
609
+ await page.wait_for_timeout(500)
610
+ outer_box: Any = await (await iframe.frame_element()).bounding_box()
611
+
612
+ if not iframe or not outer_box:
613
+ outer_box: Any = await page.locator(box_selector).last.bounding_box()
571
614
 
572
615
  # Calculate the Captcha coordinates for any viewport
573
- outer_box = await page.locator(".main-content p+div>div>div").bounding_box()
574
- captcha_x, captcha_y = outer_box["x"] + 26, outer_box["y"] + 25
616
+ captcha_x, captcha_y = outer_box["x"] + randint(26, 28), outer_box["y"] + randint(25, 27)
575
617
 
576
618
  # Move the mouse to the center of the window, then press and hold the left mouse button
577
619
  await page.mouse.click(captcha_x, captcha_y, delay=60, button="left")
578
- await page.locator(".zone-name-title").wait_for(state="hidden")
620
+ await page.wait_for_load_state("networkidle")
621
+ if iframe is not None:
622
+ # Wait for the frame to be removed from the page
623
+ while iframe in page.frames:
624
+ await page.wait_for_timeout(100)
625
+ if challenge_type != "embedded":
626
+ await page.locator(box_selector).wait_for(state="detached")
627
+ await page.locator(".zone-name-title").wait_for(state="hidden")
628
+ await page.wait_for_load_state(state="load")
579
629
  await page.wait_for_load_state(state="domcontentloaded")
580
630
 
581
631
  log.info("Cloudflare captcha is solved")
@@ -612,7 +662,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
612
662
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
613
663
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
614
664
  :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
615
- :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
665
+ :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
616
666
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
617
667
  :return: A `Response` object.
618
668
  """
@@ -647,12 +697,17 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
647
697
  if (
648
698
  finished_response.request.resource_type == "document"
649
699
  and finished_response.request.is_navigation_request()
700
+ and finished_response.request.frame == page_info.page.main_frame
650
701
  ):
651
702
  final_response = finished_response
652
703
 
653
704
  page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
654
705
  page_info.mark_busy(url=url)
655
706
 
707
+ if TYPE_CHECKING:
708
+ if not isinstance(page_info.page, async_Page):
709
+ raise TypeError
710
+
656
711
  try:
657
712
  # Navigate to URL and wait for a specified state
658
713
  page_info.page.on("response", handle_response)
@@ -701,7 +756,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
701
756
  page_info.page, first_response, final_response, params.selector_config
702
757
  )
703
758
 
704
- # Close the page, to free up resources
759
+ # Close the page to free up resources
705
760
  await page_info.page.close()
706
761
  self.page_pool.pages.remove(page_info)
707
762
 
@@ -62,7 +62,7 @@ def _set_flags(hide_canvas, disable_webgl): # pragma: no cover
62
62
  @lru_cache(2, typed=True)
63
63
  def _launch_kwargs(
64
64
  headless,
65
- proxy,
65
+ proxy: Tuple,
66
66
  locale,
67
67
  extra_headers,
68
68
  useragent,