scrapling 0.3.7__py3-none-any.whl → 0.3.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scrapling/__init__.py CHANGED
@@ -1,5 +1,5 @@
1
1
  __author__ = "Karim Shoair (karim.shoair@pm.me)"
2
- __version__ = "0.3.7"
2
+ __version__ = "0.3.8"
3
3
  __copyright__ = "Copyright (c) 2024 Karim Shoair"
4
4
 
5
5
  from typing import Any, TYPE_CHECKING
@@ -2,17 +2,27 @@ from time import time
2
2
  from asyncio import sleep as asyncio_sleep, Lock
3
3
 
4
4
  from camoufox import DefaultAddons
5
- from playwright.sync_api import BrowserContext, Playwright
5
+ from playwright.sync_api import (
6
+ Page,
7
+ Frame,
8
+ BrowserContext,
9
+ Playwright,
10
+ Response as SyncPlaywrightResponse,
11
+ )
6
12
  from playwright.async_api import (
7
- BrowserContext as AsyncBrowserContext,
13
+ Page as AsyncPage,
14
+ Frame as AsyncFrame,
8
15
  Playwright as AsyncPlaywright,
16
+ Response as AsyncPlaywrightResponse,
17
+ BrowserContext as AsyncBrowserContext,
9
18
  )
19
+ from playwright._impl._errors import Error as PlaywrightError
10
20
  from camoufox.pkgman import installed_verstr as camoufox_version
11
21
  from camoufox.utils import launch_options as generate_launch_options
12
22
 
13
23
  from ._page import PageInfo, PagePool
14
24
  from scrapling.parser import Selector
15
- from scrapling.core._types import Any, cast, Dict, Optional, TYPE_CHECKING
25
+ from scrapling.core._types import Any, cast, Dict, List, Optional, Callable, TYPE_CHECKING
16
26
  from scrapling.engines.toolbelt.fingerprints import get_os_name
17
27
  from ._validators import validate, PlaywrightConfig, CamoufoxConfig
18
28
  from ._config_tools import _compiled_stealth_scripts, _launch_kwargs, _context_kwargs
@@ -26,10 +36,35 @@ class SyncSession:
26
36
  self.max_pages = max_pages
27
37
  self.page_pool = PagePool(max_pages)
28
38
  self._max_wait_for_page = 60
29
- self.playwright: Optional[Playwright] = None
30
- self.context: Optional[BrowserContext] = None
39
+ self.playwright: Playwright | Any = None
40
+ self.context: BrowserContext | Any = None
31
41
  self._closed = False
32
42
 
43
+ def __create__(self):
44
+ pass
45
+
46
+ def close(self): # pragma: no cover
47
+ """Close all resources"""
48
+ if self._closed:
49
+ return
50
+
51
+ if self.context:
52
+ self.context.close()
53
+ self.context = None
54
+
55
+ if self.playwright:
56
+ self.playwright.stop()
57
+ self.playwright = None # pyright: ignore
58
+
59
+ self._closed = True
60
+
61
+ def __enter__(self):
62
+ self.__create__()
63
+ return self
64
+
65
+ def __exit__(self, exc_type, exc_val, exc_tb):
66
+ self.close()
67
+
33
68
  def _get_page(
34
69
  self,
35
70
  timeout: int | float,
@@ -53,7 +88,9 @@ class SyncSession:
53
88
  for script in _compiled_stealth_scripts():
54
89
  page.add_init_script(script=script)
55
90
 
56
- return self.page_pool.add_page(page)
91
+ page_info = self.page_pool.add_page(page)
92
+ page_info.mark_busy()
93
+ return page_info
57
94
 
58
95
  def get_pool_stats(self) -> Dict[str, int]:
59
96
  """Get statistics about the current page pool"""
@@ -63,17 +100,76 @@ class SyncSession:
63
100
  "max_pages": self.max_pages,
64
101
  }
65
102
 
103
+ @staticmethod
104
+ def _wait_for_networkidle(page: Page | Frame, timeout: Optional[int] = None):
105
+ """Wait for the page to become idle (no network activity) even if there are never-ending requests."""
106
+ try:
107
+ page.wait_for_load_state("networkidle", timeout=timeout)
108
+ except PlaywrightError:
109
+ pass
110
+
111
+ def _wait_for_page_stability(self, page: Page | Frame, load_dom: bool, network_idle: bool):
112
+ page.wait_for_load_state(state="load")
113
+ if load_dom:
114
+ page.wait_for_load_state(state="domcontentloaded")
115
+ if network_idle:
116
+ self._wait_for_networkidle(page)
117
+
118
+ @staticmethod
119
+ def _create_response_handler(page_info: PageInfo, response_container: List) -> Callable:
120
+ """Create a response handler that captures the final navigation response.
121
+
122
+ :param page_info: The PageInfo object containing the page
123
+ :param response_container: A list to store the final response (mutable container)
124
+ :return: A callback function for page.on("response", ...)
125
+ """
126
+
127
+ def handle_response(finished_response: SyncPlaywrightResponse):
128
+ if (
129
+ finished_response.request.resource_type == "document"
130
+ and finished_response.request.is_navigation_request()
131
+ and finished_response.request.frame == page_info.page.main_frame
132
+ ):
133
+ response_container[0] = finished_response
134
+
135
+ return handle_response
136
+
66
137
 
67
138
  class AsyncSession:
68
139
  def __init__(self, max_pages: int = 1):
69
140
  self.max_pages = max_pages
70
141
  self.page_pool = PagePool(max_pages)
71
142
  self._max_wait_for_page = 60
72
- self.playwright: Optional[AsyncPlaywright] = None
73
- self.context: Optional[AsyncBrowserContext] = None
143
+ self.playwright: AsyncPlaywright | Any = None
144
+ self.context: AsyncBrowserContext | Any = None
74
145
  self._closed = False
75
146
  self._lock = Lock()
76
147
 
148
+ async def __create__(self):
149
+ pass
150
+
151
+ async def close(self):
152
+ """Close all resources"""
153
+ if self._closed: # pragma: no cover
154
+ return
155
+
156
+ if self.context:
157
+ await self.context.close()
158
+ self.context = None # pyright: ignore
159
+
160
+ if self.playwright:
161
+ await self.playwright.stop()
162
+ self.playwright = None # pyright: ignore
163
+
164
+ self._closed = True
165
+
166
+ async def __aenter__(self):
167
+ await self.__create__()
168
+ return self
169
+
170
+ async def __aexit__(self, exc_type, exc_val, exc_tb):
171
+ await self.close()
172
+
77
173
  async def _get_page(
78
174
  self,
79
175
  timeout: int | float,
@@ -97,7 +193,6 @@ class AsyncSession:
97
193
  f"No pages finished to clear place in the pool within the {self._max_wait_for_page}s timeout period"
98
194
  )
99
195
 
100
- assert self.context is not None, "Browser context not initialized"
101
196
  page = await self.context.new_page()
102
197
  page.set_default_navigation_timeout(timeout)
103
198
  page.set_default_timeout(timeout)
@@ -121,6 +216,40 @@ class AsyncSession:
121
216
  "max_pages": self.max_pages,
122
217
  }
123
218
 
219
+ @staticmethod
220
+ async def _wait_for_networkidle(page: AsyncPage | AsyncFrame, timeout: Optional[int] = None):
221
+ """Wait for the page to become idle (no network activity) even if there are never-ending requests."""
222
+ try:
223
+ await page.wait_for_load_state("networkidle", timeout=timeout)
224
+ except PlaywrightError:
225
+ pass
226
+
227
+ async def _wait_for_page_stability(self, page: AsyncPage | AsyncFrame, load_dom: bool, network_idle: bool):
228
+ await page.wait_for_load_state(state="load")
229
+ if load_dom:
230
+ await page.wait_for_load_state(state="domcontentloaded")
231
+ if network_idle:
232
+ await self._wait_for_networkidle(page)
233
+
234
+ @staticmethod
235
+ def _create_response_handler(page_info: PageInfo, response_container: List) -> Callable:
236
+ """Create an async response handler that captures the final navigation response.
237
+
238
+ :param page_info: The PageInfo object containing the page
239
+ :param response_container: A list to store the final response (mutable container)
240
+ :return: A callback function for page.on("response", ...)
241
+ """
242
+
243
+ async def handle_response(finished_response: AsyncPlaywrightResponse):
244
+ if (
245
+ finished_response.request.resource_type == "document"
246
+ and finished_response.request.is_navigation_request()
247
+ and finished_response.request.frame == page_info.page.main_frame
248
+ ):
249
+ response_container[0] = finished_response
250
+
251
+ return handle_response
252
+
124
253
 
125
254
  class DynamicSessionMixin:
126
255
  def __validate__(self, **params):
@@ -147,6 +276,7 @@ class DynamicSessionMixin:
147
276
  self.wait_selector = config.wait_selector
148
277
  self.init_script = config.init_script
149
278
  self.wait_selector_state = config.wait_selector_state
279
+ self.extra_flags = config.extra_flags
150
280
  self.selector_config = config.selector_config
151
281
  self.additional_args = config.additional_args
152
282
  self.page_action = config.page_action
@@ -171,6 +301,7 @@ class DynamicSessionMixin:
171
301
  self.stealth,
172
302
  self.hide_canvas,
173
303
  self.disable_webgl,
304
+ tuple(self.extra_flags) if self.extra_flags else tuple(),
174
305
  )
175
306
  )
176
307
  self.launch_options["extra_http_headers"] = dict(self.launch_options["extra_http_headers"])
@@ -2,22 +2,19 @@ from random import randint
2
2
  from re import compile as re_compile
3
3
 
4
4
  from playwright.sync_api import (
5
- Response as SyncPlaywrightResponse,
6
- sync_playwright,
7
- Locator,
8
5
  Page,
6
+ Locator,
7
+ sync_playwright,
9
8
  )
10
9
  from playwright.async_api import (
11
10
  async_playwright,
12
- Response as AsyncPlaywrightResponse,
13
- BrowserContext as AsyncBrowserContext,
14
- Playwright as AsyncPlaywright,
15
- Locator as AsyncLocator,
16
11
  Page as async_Page,
12
+ Locator as AsyncLocator,
13
+ Playwright as AsyncPlaywright,
14
+ BrowserContext as AsyncBrowserContext,
17
15
  )
18
- from playwright._impl._errors import Error as PlaywrightError
19
16
 
20
- from ._validators import validate_fetch as _validate
17
+ from ._validators import validate_fetch as _validate, CamoufoxConfig
21
18
  from ._base import SyncSession, AsyncSession, StealthySessionMixin
22
19
  from scrapling.core.utils import log
23
20
  from scrapling.core._types import (
@@ -184,61 +181,21 @@ class StealthySession(StealthySessionMixin, SyncSession):
184
181
  if self.cookies: # pragma: no cover
185
182
  self.context.add_cookies(self.cookies)
186
183
 
187
- def __enter__(self): # pragma: no cover
188
- self.__create__()
189
- return self
190
-
191
- def __exit__(self, exc_type, exc_val, exc_tb):
192
- self.close()
193
-
194
- def close(self): # pragma: no cover
195
- """Close all resources"""
196
- if self._closed: # pragma: no cover
197
- return
198
-
199
- if self.context:
200
- self.context.close()
201
- self.context = None
202
-
203
- if self.playwright:
204
- self.playwright.stop()
205
- self.playwright = None
206
-
207
- self._closed = True
208
-
209
- @staticmethod
210
- def _get_page_content(page: Page) -> str:
211
- """
212
- A workaround for the Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
213
- :param page: The page to extract content from.
214
- :return:
215
- """
216
- while True:
217
- try:
218
- return page.content() or ""
219
- except PlaywrightError:
220
- page.wait_for_timeout(1000)
221
- continue
222
- return "" # pyright: ignore
223
-
224
184
  def _solve_cloudflare(self, page: Page) -> None: # pragma: no cover
225
185
  """Solve the cloudflare challenge displayed on the playwright page passed
226
186
 
227
187
  :param page: The targeted page
228
188
  :return:
229
189
  """
230
- try:
231
- page.wait_for_load_state("networkidle", timeout=5000)
232
- except PlaywrightError:
233
- pass
234
- challenge_type = self._detect_cloudflare(self._get_page_content(page))
190
+ self._wait_for_networkidle(page, timeout=5000)
191
+ challenge_type = self._detect_cloudflare(ResponseFactory._get_page_content(page))
235
192
  if not challenge_type:
236
193
  log.error("No Cloudflare challenge found.")
237
194
  return
238
195
  else:
239
196
  log.info(f'The turnstile version discovered is "{challenge_type}"')
240
197
  if challenge_type == "non-interactive":
241
- while "<title>Just a moment...</title>" in (self._get_page_content(page)):
198
+ while "<title>Just a moment...</title>" in (ResponseFactory._get_page_content(page)):
242
199
  log.info("Waiting for Cloudflare wait page to disappear.")
243
200
  page.wait_for_timeout(1000)
244
201
  page.wait_for_load_state()
@@ -249,15 +206,14 @@ class StealthySession(StealthySessionMixin, SyncSession):
249
206
  box_selector = "#cf_turnstile div, #cf-turnstile div, .turnstile>div>div"
250
207
  if challenge_type != "embedded":
251
208
  box_selector = ".main-content p+div>div>div"
252
- while "Verifying you are human." in self._get_page_content(page):
209
+ while "Verifying you are human." in ResponseFactory._get_page_content(page):
253
210
  # Waiting for the verify spinner to disappear, checking every 1s if it disappeared
254
211
  page.wait_for_timeout(500)
255
212
 
256
213
  outer_box = {}
257
214
  iframe = page.frame(url=__CF_PATTERN__)
258
215
  if iframe is not None:
259
- iframe.wait_for_load_state(state="domcontentloaded")
260
- iframe.wait_for_load_state("networkidle")
216
+ self._wait_for_page_stability(iframe, True, True)
261
217
 
262
218
  if challenge_type != "embedded":
263
219
  while not iframe.frame_element().is_visible():
@@ -273,16 +229,20 @@ class StealthySession(StealthySessionMixin, SyncSession):
273
229
 
274
230
  # Move the mouse to the center of the window, then press and hold the left mouse button
275
231
  page.mouse.click(captcha_x, captcha_y, delay=60, button="left")
276
- page.wait_for_load_state("networkidle")
232
+ self._wait_for_networkidle(page)
277
233
  if iframe is not None:
278
- # Wait for the frame to be removed from the page
234
+ # Wait for the frame to be removed from the page (with 30s timeout = 300 iterations * 100 ms)
235
+ attempts = 0
279
236
  while iframe in page.frames:
237
+ if attempts >= 300:
238
+ log.info("Cloudflare iframe didn't disappear after 30s, continuing...")
239
+ break
280
240
  page.wait_for_timeout(100)
241
+ attempts += 1
281
242
  if challenge_type != "embedded":
282
243
  page.locator(box_selector).last.wait_for(state="detached")
283
244
  page.locator(".zone-name-title").wait_for(state="hidden")
284
- page.wait_for_load_state(state="load")
285
- page.wait_for_load_state(state="domcontentloaded")
245
+ self._wait_for_page_stability(page, True, False)
286
246
 
287
247
  log.info("Cloudflare captcha is solved")
288
248
  return
@@ -337,38 +297,26 @@ class StealthySession(StealthySessionMixin, SyncSession):
337
297
  ("solve_cloudflare", solve_cloudflare, self.solve_cloudflare),
338
298
  ("selector_config", selector_config, self.selector_config),
339
299
  ],
300
+ CamoufoxConfig,
340
301
  _UNSET,
341
302
  )
342
303
 
343
304
  if self._closed: # pragma: no cover
344
305
  raise RuntimeError("Context manager has been closed")
345
306
 
346
- final_response = None
347
307
  referer = (
348
308
  generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
349
309
  )
350
310
 
351
- def handle_response(finished_response: SyncPlaywrightResponse):
352
- nonlocal final_response
353
- if (
354
- finished_response.request.resource_type == "document"
355
- and finished_response.request.is_navigation_request()
356
- and finished_response.request.frame == page_info.page.main_frame
357
- ):
358
- final_response = finished_response
359
-
360
311
  page_info = self._get_page(params.timeout, params.extra_headers, params.disable_resources)
361
- page_info.mark_busy(url=url)
312
+ final_response = [None]
313
+ handle_response = self._create_response_handler(page_info, final_response)
362
314
 
363
315
  try: # pragma: no cover
364
316
  # Navigate to URL and wait for a specified state
365
317
  page_info.page.on("response", handle_response)
366
318
  first_response = page_info.page.goto(url, referer=referer)
367
- if params.load_dom:
368
- page_info.page.wait_for_load_state(state="domcontentloaded")
369
-
370
- if params.network_idle:
371
- page_info.page.wait_for_load_state("networkidle")
319
+ self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
372
320
 
373
321
  if not first_response:
374
322
  raise RuntimeError(f"Failed to get response for {url}")
@@ -376,11 +324,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
376
324
  if params.solve_cloudflare:
377
325
  self._solve_cloudflare(page_info.page)
378
326
  # Make sure the page is fully loaded after the captcha
379
- page_info.page.wait_for_load_state(state="load")
380
- if params.load_dom:
381
- page_info.page.wait_for_load_state(state="domcontentloaded")
382
- if params.network_idle:
383
- page_info.page.wait_for_load_state("networkidle")
327
+ self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
384
328
 
385
329
  if params.page_action:
386
330
  try:
@@ -393,17 +337,13 @@ class StealthySession(StealthySessionMixin, SyncSession):
393
337
  waiter: Locator = page_info.page.locator(params.wait_selector)
394
338
  waiter.first.wait_for(state=params.wait_selector_state)
395
339
  # Wait again after waiting for the selector, helpful with protections like Cloudflare
396
- page_info.page.wait_for_load_state(state="load")
397
- if params.load_dom:
398
- page_info.page.wait_for_load_state(state="domcontentloaded")
399
- if params.network_idle:
400
- page_info.page.wait_for_load_state("networkidle")
340
+ self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
401
341
  except Exception as e:
402
342
  log.error(f"Error waiting for selector {params.wait_selector}: {e}")
403
343
 
404
344
  page_info.page.wait_for_timeout(params.wait)
405
345
  response = ResponseFactory.from_playwright_response(
406
- page_info.page, first_response, final_response, params.selector_config
346
+ page_info.page, first_response, final_response[0], params.selector_config, bool(params.page_action)
407
347
  )
408
348
 
409
349
  # Close the page to free up resources
@@ -528,61 +468,21 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
528
468
  if self.cookies:
529
469
  await self.context.add_cookies(self.cookies) # pyright: ignore [reportArgumentType]
530
470
 
531
- async def __aenter__(self):
532
- await self.__create__()
533
- return self
534
-
535
- async def __aexit__(self, exc_type, exc_val, exc_tb):
536
- await self.close()
537
-
538
- async def close(self):
539
- """Close all resources"""
540
- if self._closed: # pragma: no cover
541
- return
542
-
543
- if self.context:
544
- await self.context.close()
545
- self.context = None # pyright: ignore
546
-
547
- if self.playwright:
548
- await self.playwright.stop()
549
- self.playwright = None # pyright: ignore
550
-
551
- self._closed = True
552
-
553
- @staticmethod
554
- async def _get_page_content(page: async_Page) -> str:
555
- """
556
- A workaround for the Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
557
- :param page: The page to extract content from.
558
- :return:
559
- """
560
- while True:
561
- try:
562
- return (await page.content()) or ""
563
- except PlaywrightError:
564
- await page.wait_for_timeout(1000)
565
- continue
566
- return "" # pyright: ignore
567
-
568
- async def _solve_cloudflare(self, page: async_Page):
471
+ async def _solve_cloudflare(self, page: async_Page): # pragma: no cover
569
472
  """Solve the cloudflare challenge displayed on the playwright page passed. The async version
570
473
 
571
474
  :param page: The async targeted page
572
475
  :return:
573
476
  """
574
- try:
575
- await page.wait_for_load_state("networkidle", timeout=5000)
576
- except PlaywrightError:
577
- pass
578
- challenge_type = self._detect_cloudflare(await self._get_page_content(page))
477
+ await self._wait_for_networkidle(page, timeout=5000)
478
+ challenge_type = self._detect_cloudflare(await ResponseFactory._get_async_page_content(page))
579
479
  if not challenge_type:
580
480
  log.error("No Cloudflare challenge found.")
581
481
  return
582
482
  else:
583
483
  log.info(f'The turnstile version discovered is "{challenge_type}"')
584
484
  if challenge_type == "non-interactive": # pragma: no cover
585
- while "<title>Just a moment...</title>" in (await self._get_page_content(page)):
485
+ while "<title>Just a moment...</title>" in (await ResponseFactory._get_async_page_content(page)):
586
486
  log.info("Waiting for Cloudflare wait page to disappear.")
587
487
  await page.wait_for_timeout(1000)
588
488
  await page.wait_for_load_state()
@@ -593,15 +493,14 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
593
493
  box_selector = "#cf_turnstile div, #cf-turnstile div, .turnstile>div>div"
594
494
  if challenge_type != "embedded":
595
495
  box_selector = ".main-content p+div>div>div"
596
- while "Verifying you are human." in (await self._get_page_content(page)):
496
+ while "Verifying you are human." in (await ResponseFactory._get_async_page_content(page)):
597
497
  # Waiting for the verify spinner to disappear, checking every 1s if it disappeared
598
498
  await page.wait_for_timeout(500)
599
499
 
600
500
  outer_box = {}
601
501
  iframe = page.frame(url=__CF_PATTERN__)
602
502
  if iframe is not None:
603
- await iframe.wait_for_load_state(state="domcontentloaded")
604
- await iframe.wait_for_load_state("networkidle")
503
+ await self._wait_for_page_stability(iframe, True, True)
605
504
 
606
505
  if challenge_type != "embedded":
607
506
  while not await (await iframe.frame_element()).is_visible():
@@ -617,16 +516,20 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
617
516
 
618
517
  # Move the mouse to the center of the window, then press and hold the left mouse button
619
518
  await page.mouse.click(captcha_x, captcha_y, delay=60, button="left")
620
- await page.wait_for_load_state("networkidle")
519
+ await self._wait_for_networkidle(page)
621
520
  if iframe is not None:
622
- # Wait for the frame to be removed from the page
521
+ # Wait for the frame to be removed from the page (with 30s timeout = 300 iterations * 100 ms)
522
+ attempts = 0
623
523
  while iframe in page.frames:
524
+ if attempts >= 300:
525
+ log.info("Cloudflare iframe didn't disappear after 30s, continuing...")
526
+ break
624
527
  await page.wait_for_timeout(100)
528
+ attempts += 1
625
529
  if challenge_type != "embedded":
626
530
  await page.locator(box_selector).wait_for(state="detached")
627
531
  await page.locator(".zone-name-title").wait_for(state="hidden")
628
- await page.wait_for_load_state(state="load")
629
- await page.wait_for_load_state(state="domcontentloaded")
532
+ await self._wait_for_page_stability(page, True, False)
630
533
 
631
534
  log.info("Cloudflare captcha is solved")
632
535
  return
@@ -681,28 +584,20 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
681
584
  ("solve_cloudflare", solve_cloudflare, self.solve_cloudflare),
682
585
  ("selector_config", selector_config, self.selector_config),
683
586
  ],
587
+ CamoufoxConfig,
684
588
  _UNSET,
685
589
  )
686
590
 
687
591
  if self._closed: # pragma: no cover
688
592
  raise RuntimeError("Context manager has been closed")
689
593
 
690
- final_response = None
691
594
  referer = (
692
595
  generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
693
596
  )
694
597
 
695
- async def handle_response(finished_response: AsyncPlaywrightResponse):
696
- nonlocal final_response
697
- if (
698
- finished_response.request.resource_type == "document"
699
- and finished_response.request.is_navigation_request()
700
- and finished_response.request.frame == page_info.page.main_frame
701
- ):
702
- final_response = finished_response
703
-
704
598
  page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
705
- page_info.mark_busy(url=url)
599
+ final_response = [None]
600
+ handle_response = self._create_response_handler(page_info, final_response)
706
601
 
707
602
  if TYPE_CHECKING:
708
603
  if not isinstance(page_info.page, async_Page):
@@ -712,11 +607,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
712
607
  # Navigate to URL and wait for a specified state
713
608
  page_info.page.on("response", handle_response)
714
609
  first_response = await page_info.page.goto(url, referer=referer)
715
- if params.load_dom:
716
- await page_info.page.wait_for_load_state(state="domcontentloaded")
717
-
718
- if params.network_idle:
719
- await page_info.page.wait_for_load_state("networkidle")
610
+ await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
720
611
 
721
612
  if not first_response:
722
613
  raise RuntimeError(f"Failed to get response for {url}")
@@ -724,11 +615,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
724
615
  if params.solve_cloudflare:
725
616
  await self._solve_cloudflare(page_info.page)
726
617
  # Make sure the page is fully loaded after the captcha
727
- await page_info.page.wait_for_load_state(state="load")
728
- if params.load_dom:
729
- await page_info.page.wait_for_load_state(state="domcontentloaded")
730
- if params.network_idle:
731
- await page_info.page.wait_for_load_state("networkidle")
618
+ await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
732
619
 
733
620
  if params.page_action:
734
621
  try:
@@ -741,11 +628,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
741
628
  waiter: AsyncLocator = page_info.page.locator(params.wait_selector)
742
629
  await waiter.first.wait_for(state=params.wait_selector_state)
743
630
  # Wait again after waiting for the selector, helpful with protections like Cloudflare
744
- await page_info.page.wait_for_load_state(state="load")
745
- if params.load_dom:
746
- await page_info.page.wait_for_load_state(state="domcontentloaded")
747
- if params.network_idle:
748
- await page_info.page.wait_for_load_state("networkidle")
631
+ await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
749
632
  except Exception as e:
750
633
  log.error(f"Error waiting for selector {params.wait_selector}: {e}")
751
634
 
@@ -753,7 +636,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
753
636
 
754
637
  # Create response object
755
638
  response = await ResponseFactory.from_async_playwright_response(
756
- page_info.page, first_response, final_response, params.selector_config
639
+ page_info.page, first_response, final_response[0], params.selector_config, bool(params.page_action)
757
640
  )
758
641
 
759
642
  # Close the page to free up resources
@@ -70,12 +70,17 @@ def _launch_kwargs(
70
70
  stealth,
71
71
  hide_canvas,
72
72
  disable_webgl,
73
+ extra_flags: Tuple,
73
74
  ) -> Tuple:
74
75
  """Creates the arguments we will use while launching playwright's browser"""
76
+ base_args = DEFAULT_FLAGS
77
+ if extra_flags:
78
+ base_args = base_args + extra_flags
79
+
75
80
  launch_kwargs = {
76
81
  "locale": locale,
77
82
  "headless": headless,
78
- "args": DEFAULT_FLAGS,
83
+ "args": base_args,
79
84
  "color_scheme": "dark", # Bypasses the 'prefersLightColor' check in creepjs
80
85
  "proxy": proxy or tuple(),
81
86
  "device_scale_factor": 2,
@@ -85,9 +90,10 @@ def _launch_kwargs(
85
90
  "user_agent": useragent or __default_useragent__,
86
91
  }
87
92
  if stealth:
93
+ stealth_args = base_args + _set_flags(hide_canvas, disable_webgl)
88
94
  launch_kwargs.update(
89
95
  {
90
- "args": DEFAULT_FLAGS + _set_flags(hide_canvas, disable_webgl),
96
+ "args": stealth_args,
91
97
  "chromium_sandbox": True,
92
98
  "is_mobile": False,
93
99
  "has_touch": False,