PlaywrightCapture 1.25.2__tar.gz → 1.25.4__tar.gz

This diff shows the differences between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: PlaywrightCapture
3
- Version: 1.25.2
3
+ Version: 1.25.4
4
4
  Summary: A simple library to capture websites using playwright
5
5
  Home-page: https://github.com/Lookyloo/PlaywrightCapture
6
6
  License: BSD-3-Clause
@@ -28,7 +28,7 @@ Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.12.3,<5.0.0)
28
28
  Requires-Dist: dateparser (>=1.2.0,<2.0.0)
29
29
  Requires-Dist: playwright (>=1.45.0,<2.0.0)
30
30
  Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
31
- Requires-Dist: puremagic (>=1.25,<2.0)
31
+ Requires-Dist: puremagic (>=1.26,<2.0)
32
32
  Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
33
33
  Requires-Dist: pytz (>=2024.1,<2025.0) ; python_version < "3.9"
34
34
  Requires-Dist: setuptools (>=70.3.0,<71.0.0)
@@ -32,7 +32,7 @@ from playwright._impl._errors import TargetClosedError
32
32
  from playwright.async_api import async_playwright, Frame, Error, Page, Download, Request
33
33
  from playwright.async_api import TimeoutError as PlaywrightTimeoutError
34
34
  from playwright_stealth import stealth_async, StealthConfig # type: ignore[import-untyped]
35
- from puremagic import PureError, from_string # type: ignore[import-untyped]
35
+ from puremagic import PureError, from_string
36
36
  from w3lib.html import strip_html5_whitespace
37
37
  from w3lib.url import canonicalize_url, safe_url_string
38
38
 
@@ -682,7 +682,7 @@ class Capture():
682
682
  capturing_sub = False
683
683
  try:
684
684
  page = await self.context.new_page()
685
- await page.clock.install()
685
+ # await page.clock.install()
686
686
  except Error as e:
687
687
  self.logger.warning(f'The context is in a broken state: {e}')
688
688
  self.should_retry = True
@@ -706,6 +706,7 @@ class Capture():
706
706
  page.set_default_timeout((self._capture_timeout - 2) * 1000)
707
707
  # trigger a callback on each request to store it in a dict indexed by URL to get it back from the favicon fetcher
708
708
  page.on("requestfinished", store_request)
709
+ page.on("dialog", lambda dialog: dialog.accept())
709
710
 
710
711
  try:
711
712
  # Parse the URL. If there is a fragment, we need to scroll to it manually
@@ -761,31 +762,31 @@ class Capture():
761
762
  await self._wait_for_random_timeout(page, 5) # Wait 5 sec after document loaded
762
763
  self.logger.debug('Start instrumentation.')
763
764
 
764
- # ==== recaptcha
765
- # Same technique as: https://github.com/NikolaiT/uncaptcha3
766
- if CAN_SOLVE_CAPTCHA:
767
- try:
768
- if (await page.locator("//iframe[@title='reCAPTCHA']").first.is_visible(timeout=3000)
769
- and await page.locator("//iframe[@title='reCAPTCHA']").first.is_enabled(timeout=2000)):
770
- self.logger.info('Found a captcha')
771
- await self._recaptcha_solver(page)
772
- except PlaywrightTimeoutError as e:
773
- self.logger.info(f'Captcha on {url} is not ready: {e}')
774
- except TargetClosedError as e:
775
- self.logger.warning(f'Target closed while resolving captcha on {url}: {e}')
776
- except Error as e:
777
- self.logger.warning(f'Error while resolving captcha on {url}: {e}')
778
- except Exception as e:
779
- self.logger.exception(f'General error with captcha solving on {url}: {e}')
780
- # ======
781
- # NOTE: testing
782
- # await self.__cloudflare_bypass_attempt(page)
783
- self.logger.debug('Done with captcha.')
784
-
785
765
  # check if we have anything on the page. If we don't, the page is not working properly.
786
766
  if await self._failsafe_get_content(page):
787
767
  self.logger.debug('Got rendered content')
788
768
 
769
+ # ==== recaptcha
770
+ # Same technique as: https://github.com/NikolaiT/uncaptcha3
771
+ if CAN_SOLVE_CAPTCHA:
772
+ try:
773
+ if (await page.locator("//iframe[@title='reCAPTCHA']").first.is_visible(timeout=3000)
774
+ and await page.locator("//iframe[@title='reCAPTCHA']").first.is_enabled(timeout=2000)):
775
+ self.logger.info('Found a captcha')
776
+ await self._recaptcha_solver(page)
777
+ except PlaywrightTimeoutError as e:
778
+ self.logger.info(f'Captcha on {url} is not ready: {e}')
779
+ except TargetClosedError as e:
780
+ self.logger.warning(f'Target closed while resolving captcha on {url}: {e}')
781
+ except Error as e:
782
+ self.logger.warning(f'Error while resolving captcha on {url}: {e}')
783
+ except Exception as e:
784
+ self.logger.exception(f'General error with captcha solving on {url}: {e}')
785
+ # ======
786
+ # NOTE: testing
787
+ # await self.__cloudflare_bypass_attempt(page)
788
+ self.logger.debug('Done with captcha.')
789
+
789
790
  # move mouse
790
791
  await page.mouse.move(x=random.uniform(300, 800), y=random.uniform(200, 500))
791
792
  self.logger.debug('Moved mouse.')
@@ -865,8 +866,12 @@ class Capture():
865
866
  to_return["downloaded_file"] = mem_zip.getvalue()
866
867
 
867
868
  # fast forward 30s
868
- await page.clock.run_for("30")
869
- self.logger.debug('Moved time forward.')
869
+ # try:
870
+ # async with timeout(3):
871
+ # await page.clock.run_for("47")
872
+ # self.logger.debug('Moved time forward.')
873
+ # except TimeoutError:
874
+ # self.logger.warning('Unable to move time forward.')
870
875
 
871
876
  self.logger.debug('Done with instrumentation, waiting for network idle.')
872
877
  await self._wait_for_random_timeout(page, 5) # Wait 5 sec after instrumentation
@@ -1077,8 +1082,9 @@ class Capture():
1077
1082
  tries = 3
1078
1083
  while tries:
1079
1084
  try:
1080
- return await page.content()
1081
- except Error:
1085
+ async with timeout(30):
1086
+ return await page.content()
1087
+ except (Error, TimeoutError):
1082
1088
  self.logger.debug('Unable to get page content, trying again.')
1083
1089
  tries -= 1
1084
1090
  await self._wait_for_random_timeout(page, 1)
@@ -1224,6 +1230,11 @@ class Capture():
1224
1230
  if ': ' in name:
1225
1231
  _, name = name.split(': ', maxsplit=1)
1226
1232
  exception._name = name.strip()
1233
+ else:
1234
+ # The format changed in Playwright 1.43.0, the name of the method that failed is set before the exception itself.
1235
+ if ': ' in exception.message:
1236
+ _, name = exception.message.split(': ', maxsplit=1)
1237
+ exception._name = name.strip()
1227
1238
 
1228
1239
  def _exception_is_network_error(self, exception: Error) -> bool:
1229
1240
  if exception.name in [
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "PlaywrightCapture"
3
- version = "1.25.2"
3
+ version = "1.25.4"
4
4
  description = "A simple library to capture websites using playwright"
5
5
  authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
6
6
  license = "BSD-3-Clause"
@@ -29,7 +29,7 @@ pytz = {"version" = "^2024.1", python = "<3.9"}
29
29
  tzdata = "^2024.1"
30
30
  playwright-stealth = "^1.0.6"
31
31
  setuptools = "^70.3.0"
32
- puremagic = "^1.25"
32
+ puremagic = "^1.26"
33
33
  async-timeout = {version = "^4.0.3", python = "<3.11"}
34
34
  aiohttp = {extras = ["speedups"], version = "^3.9.5"}
35
35
  aiohttp-socks = "^0.8.4"