PlaywrightCapture 1.25.3__tar.gz → 1.25.4__tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: PlaywrightCapture
3
- Version: 1.25.3
3
+ Version: 1.25.4
4
4
  Summary: A simple library to capture websites using playwright
5
5
  Home-page: https://github.com/Lookyloo/PlaywrightCapture
6
6
  License: BSD-3-Clause
@@ -682,8 +682,7 @@ class Capture():
682
682
  capturing_sub = False
683
683
  try:
684
684
  page = await self.context.new_page()
685
- await page.clock.install()
686
- page.on("dialog", lambda dialog: dialog.accept())
685
+ # await page.clock.install()
687
686
  except Error as e:
688
687
  self.logger.warning(f'The context is in a broken state: {e}')
689
688
  self.should_retry = True
@@ -707,6 +706,7 @@ class Capture():
707
706
  page.set_default_timeout((self._capture_timeout - 2) * 1000)
708
707
  # trigger a callback on each request to store it in a dict indexed by URL to get it back from the favicon fetcher
709
708
  page.on("requestfinished", store_request)
709
+ page.on("dialog", lambda dialog: dialog.accept())
710
710
 
711
711
  try:
712
712
  # Parse the URL. If there is a fragment, we need to scroll to it manually
@@ -762,31 +762,31 @@ class Capture():
762
762
  await self._wait_for_random_timeout(page, 5) # Wait 5 sec after document loaded
763
763
  self.logger.debug('Start instrumentation.')
764
764
 
765
- # ==== recaptcha
766
- # Same technique as: https://github.com/NikolaiT/uncaptcha3
767
- if CAN_SOLVE_CAPTCHA:
768
- try:
769
- if (await page.locator("//iframe[@title='reCAPTCHA']").first.is_visible(timeout=3000)
770
- and await page.locator("//iframe[@title='reCAPTCHA']").first.is_enabled(timeout=2000)):
771
- self.logger.info('Found a captcha')
772
- await self._recaptcha_solver(page)
773
- except PlaywrightTimeoutError as e:
774
- self.logger.info(f'Captcha on {url} is not ready: {e}')
775
- except TargetClosedError as e:
776
- self.logger.warning(f'Target closed while resolving captcha on {url}: {e}')
777
- except Error as e:
778
- self.logger.warning(f'Error while resolving captcha on {url}: {e}')
779
- except Exception as e:
780
- self.logger.exception(f'General error with captcha solving on {url}: {e}')
781
- # ======
782
- # NOTE: testing
783
- # await self.__cloudflare_bypass_attempt(page)
784
- self.logger.debug('Done with captcha.')
785
-
786
765
  # check if we have anything on the page. If we don't, the page is not working properly.
787
766
  if await self._failsafe_get_content(page):
788
767
  self.logger.debug('Got rendered content')
789
768
 
769
+ # ==== recaptcha
770
+ # Same technique as: https://github.com/NikolaiT/uncaptcha3
771
+ if CAN_SOLVE_CAPTCHA:
772
+ try:
773
+ if (await page.locator("//iframe[@title='reCAPTCHA']").first.is_visible(timeout=3000)
774
+ and await page.locator("//iframe[@title='reCAPTCHA']").first.is_enabled(timeout=2000)):
775
+ self.logger.info('Found a captcha')
776
+ await self._recaptcha_solver(page)
777
+ except PlaywrightTimeoutError as e:
778
+ self.logger.info(f'Captcha on {url} is not ready: {e}')
779
+ except TargetClosedError as e:
780
+ self.logger.warning(f'Target closed while resolving captcha on {url}: {e}')
781
+ except Error as e:
782
+ self.logger.warning(f'Error while resolving captcha on {url}: {e}')
783
+ except Exception as e:
784
+ self.logger.exception(f'General error with captcha solving on {url}: {e}')
785
+ # ======
786
+ # NOTE: testing
787
+ # await self.__cloudflare_bypass_attempt(page)
788
+ self.logger.debug('Done with captcha.')
789
+
790
790
  # move mouse
791
791
  await page.mouse.move(x=random.uniform(300, 800), y=random.uniform(200, 500))
792
792
  self.logger.debug('Moved mouse.')
@@ -866,8 +866,12 @@ class Capture():
866
866
  to_return["downloaded_file"] = mem_zip.getvalue()
867
867
 
868
868
  # fast forward 30s
869
- await page.clock.run_for("30")
870
- self.logger.debug('Moved time forward.')
869
+ # try:
870
+ # async with timeout(3):
871
+ # await page.clock.run_for("47")
872
+ # self.logger.debug('Moved time forward.')
873
+ # except TimeoutError:
874
+ # self.logger.warning('Unable to move time forward.')
871
875
 
872
876
  self.logger.debug('Done with instrumentation, waiting for network idle.')
873
877
  await self._wait_for_random_timeout(page, 5) # Wait 5 sec after instrumentation
@@ -1078,8 +1082,9 @@ class Capture():
1078
1082
  tries = 3
1079
1083
  while tries:
1080
1084
  try:
1081
- return await page.content()
1082
- except Error:
1085
+ async with timeout(30):
1086
+ return await page.content()
1087
+ except (Error, TimeoutError):
1083
1088
  self.logger.debug('Unable to get page content, trying again.')
1084
1089
  tries -= 1
1085
1090
  await self._wait_for_random_timeout(page, 1)
@@ -1225,6 +1230,11 @@ class Capture():
1225
1230
  if ': ' in name:
1226
1231
  _, name = name.split(': ', maxsplit=1)
1227
1232
  exception._name = name.strip()
1233
+ else:
1234
+ # The format changed in Playwright 1.43.0, the name of the method that failed is set before the exception itself.
1235
+ if ': ' in exception.message:
1236
+ _, name = exception.message.split(': ', maxsplit=1)
1237
+ exception._name = name.strip()
1228
1238
 
1229
1239
  def _exception_is_network_error(self, exception: Error) -> bool:
1230
1240
  if exception.name in [
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "PlaywrightCapture"
3
- version = "1.25.3"
3
+ version = "1.25.4"
4
4
  description = "A simple library to capture websites using playwright"
5
5
  authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
6
6
  license = "BSD-3-Clause"