PlaywrightCapture 1.23.11__tar.gz → 1.23.13__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: PlaywrightCapture
3
- Version: 1.23.11
3
+ Version: 1.23.13
4
4
  Summary: A simple library to capture websites using playwright
5
5
  Home-page: https://github.com/Lookyloo/PlaywrightCapture
6
6
  License: BSD-3-Clause
@@ -122,7 +122,7 @@ class Capture():
122
122
  'password': proxy.get('password', '')}
123
123
 
124
124
  self.should_retry: bool = False
125
- self.__network_not_idle: int = 1
125
+ self.__network_not_idle: int = 2 # start at 2 so that a wait for network idle never takes the maximum amount of time the capture is allowed to take
126
126
  self._cookies: list[SetCookieParam] = []
127
127
  self._http_credentials: HttpCredentials = {}
128
128
  self._geolocation: Geolocation = {}
@@ -461,7 +461,7 @@ class Capture():
461
461
  await page.locator("#onetrust-accept-btn-handler").click()
462
462
 
463
463
  await page.add_locator_handler(
464
- page.locator('.ot-sdk-container'),
464
+ page.locator('#onetrust-banner-sdk'),
465
465
  handler
466
466
  )
467
467
  self.logger.info('OT handler added')
@@ -524,6 +524,7 @@ class Capture():
524
524
  ) -> CaptureResponse:
525
525
 
526
526
  to_return: CaptureResponse = {}
527
+ got_favicons = False
527
528
 
528
529
  # We don't need to be super strict on the lock, as it simply triggers a wait for network idle before stopping the capture
529
530
  # but we still need it to be an integer in case we have more than one download triggered and one finished when the others haven't
@@ -551,8 +552,12 @@ class Capture():
551
552
 
552
553
  async def store_request(request: Request) -> None:
553
554
  # This method is called on each request, to store the body (if it is an image) in a dict indexed by URL
555
+ if got_favicons:
556
+ return
554
557
  try:
555
558
  if response := await request.response():
559
+ if got_favicons:
560
+ return
556
561
  if response.ok:
557
562
  try:
558
563
  if body := await response.body():
@@ -585,7 +590,7 @@ class Capture():
585
590
  await stealth_async(page)
586
591
  page.set_default_timeout(self._capture_timeout * 1000)
587
592
  # trigger a callback on each request to store it in a dict indexed by URL to get it back from the favicon fetcher
588
- page.on("request", store_request)
593
+ page.on("requestfinished", store_request)
589
594
 
590
595
  try:
591
596
  # Parse the URL. If there is a fragment, we need to scroll to it manually
@@ -645,8 +650,8 @@ class Capture():
645
650
  # Same technique as: https://github.com/NikolaiT/uncaptcha3
646
651
  if CAN_SOLVE_CAPTCHA:
647
652
  try:
648
- if (await page.locator("//iframe[@title='reCAPTCHA']").first.is_visible(timeout=5000)
649
- and await page.locator("//iframe[@title='reCAPTCHA']").first.is_enabled(timeout=5000)):
653
+ if (await page.locator("//iframe[@title='reCAPTCHA']").first.is_visible(timeout=3000)
654
+ and await page.locator("//iframe[@title='reCAPTCHA']").first.is_enabled(timeout=2000)):
650
655
  self.logger.info('Found a captcha')
651
656
  await self._recaptcha_solver(page)
652
657
  except PlaywrightTimeoutError as e:
@@ -675,7 +680,7 @@ class Capture():
675
680
  # We got a fragment, make sure we go to it and scroll only a little bit.
676
681
  fragment = unquote(parsed_url.fragment)
677
682
  try:
678
- await page.locator(f'id={fragment}').first.scroll_into_view_if_needed(timeout=5000)
683
+ await page.locator(f'id={fragment}').first.scroll_into_view_if_needed(timeout=3000)
679
684
  await self._safe_wait(page)
680
685
  await page.mouse.wheel(delta_y=random.uniform(150, 300), delta_x=0)
681
686
  self.logger.debug('Jumped to fragment.')
@@ -717,10 +722,12 @@ class Capture():
717
722
 
718
723
  to_return['last_redirected_url'] = page.url
719
724
 
720
- to_return['png'] = await self._failsafe_get_screenshot(page)
721
-
722
725
  if 'html' in to_return and to_return['html'] is not None and with_favicon:
723
726
  to_return['potential_favicons'] = self.get_favicons(page.url, to_return['html'])
727
+ got_favicons = True
728
+
729
+ await self._safe_wait(page)
730
+ to_return['png'] = await self._failsafe_get_screenshot(page)
724
731
 
725
732
  if self.wait_for_download > 0:
726
733
  self.logger.info('Waiting for download to finish...')
@@ -853,17 +860,17 @@ class Capture():
853
860
 
854
861
  async def _failsafe_get_screenshot(self, page: Page) -> bytes:
855
862
  try:
856
- return await page.screenshot(full_page=True)
863
+ return await page.screenshot(full_page=True, timeout=5000)
857
864
  except Error as e:
858
865
  self.logger.info(f"Capturing a screenshot of the full page failed, trying to scale it down: {e}")
859
866
 
860
867
  try:
861
- return await page.screenshot(full_page=True, scale="css")
868
+ return await page.screenshot(full_page=True, scale="css", timeout=10000)
862
869
  except Error as e:
863
870
  self.logger.info(f"Capturing a screenshot of the full page failed, trying to get the current viewport only: {e}")
864
871
 
865
872
  try:
866
- return await page.screenshot()
873
+ return await page.screenshot(scale="css", animations='disabled', timeout=10000)
867
874
  except Error as e:
868
875
  self.logger.warning(f"Unable to get any screenshot: {e}")
869
876
  raise e
@@ -871,7 +878,7 @@ class Capture():
871
878
  async def _safe_wait(self, page: Page) -> None:
872
879
  try:
873
880
  # If we don't have networkidle relatively quickly, it's probably because we're playing a video.
874
- await page.wait_for_load_state('networkidle', timeout=10000 / self.__network_not_idle)
881
+ await page.wait_for_load_state('networkidle', timeout=self._capture_timeout / self.__network_not_idle)
875
882
  except PlaywrightTimeoutError:
876
883
  # Network never idle, keep going
877
884
  self.__network_not_idle += 1
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "PlaywrightCapture"
3
- version = "1.23.11"
3
+ version = "1.23.13"
4
4
  description = "A simple library to capture websites using playwright"
5
5
  authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
6
6
  license = "BSD-3-Clause"