PlaywrightCapture 1.25.3__py3-none-any.whl → 1.25.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -165,10 +165,15 @@ class Capture():
165
165
  if proxy:
166
166
  if isinstance(proxy, str):
167
167
  self.proxy = {'server': proxy}
168
- else:
168
+ elif isinstance(proxy, dict):
169
169
  self.proxy = {'server': proxy['server'], 'bypass': proxy.get('bypass', ''),
170
170
  'username': proxy.get('username', ''),
171
171
  'password': proxy.get('password', '')}
172
+ elif isinstance(proxy, int):
173
+ # This is clearly a mistake, just ignoring it
174
+ self.logger.warning('Proxy is an integer, this is a mistake, ignoring it.')
175
+ else:
176
+ raise InvalidPlaywrightParameter(f'Invalid proxy parameter: "{proxy}" ({type(proxy)})')
172
177
 
173
178
  self.should_retry: bool = False
174
179
  self.__network_not_idle: int = 2 # makes sure we do not wait for network idle the max amount of time the capture is allowed to take
@@ -682,8 +687,7 @@ class Capture():
682
687
  capturing_sub = False
683
688
  try:
684
689
  page = await self.context.new_page()
685
- await page.clock.install()
686
- page.on("dialog", lambda dialog: dialog.accept())
690
+ # await page.clock.install()
687
691
  except Error as e:
688
692
  self.logger.warning(f'The context is in a broken state: {e}')
689
693
  self.should_retry = True
@@ -707,6 +711,7 @@ class Capture():
707
711
  page.set_default_timeout((self._capture_timeout - 2) * 1000)
708
712
  # trigger a callback on each request to store it in a dict indexed by URL to get it back from the favicon fetcher
709
713
  page.on("requestfinished", store_request)
714
+ page.on("dialog", lambda dialog: dialog.accept())
710
715
 
711
716
  try:
712
717
  # Parse the URL. If there is a fragment, we need to scroll to it manually
@@ -762,31 +767,31 @@ class Capture():
762
767
  await self._wait_for_random_timeout(page, 5) # Wait 5 sec after document loaded
763
768
  self.logger.debug('Start instrumentation.')
764
769
 
765
- # ==== recaptcha
766
- # Same technique as: https://github.com/NikolaiT/uncaptcha3
767
- if CAN_SOLVE_CAPTCHA:
768
- try:
769
- if (await page.locator("//iframe[@title='reCAPTCHA']").first.is_visible(timeout=3000)
770
- and await page.locator("//iframe[@title='reCAPTCHA']").first.is_enabled(timeout=2000)):
771
- self.logger.info('Found a captcha')
772
- await self._recaptcha_solver(page)
773
- except PlaywrightTimeoutError as e:
774
- self.logger.info(f'Captcha on {url} is not ready: {e}')
775
- except TargetClosedError as e:
776
- self.logger.warning(f'Target closed while resolving captcha on {url}: {e}')
777
- except Error as e:
778
- self.logger.warning(f'Error while resolving captcha on {url}: {e}')
779
- except Exception as e:
780
- self.logger.exception(f'General error with captcha solving on {url}: {e}')
781
- # ======
782
- # NOTE: testing
783
- # await self.__cloudflare_bypass_attempt(page)
784
- self.logger.debug('Done with captcha.')
785
-
786
770
  # check if we have anything on the page. If we don't, the page is not working properly.
787
771
  if await self._failsafe_get_content(page):
788
772
  self.logger.debug('Got rendered content')
789
773
 
774
+ # ==== recaptcha
775
+ # Same technique as: https://github.com/NikolaiT/uncaptcha3
776
+ if CAN_SOLVE_CAPTCHA:
777
+ try:
778
+ if (await page.locator("//iframe[@title='reCAPTCHA']").first.is_visible(timeout=3000)
779
+ and await page.locator("//iframe[@title='reCAPTCHA']").first.is_enabled(timeout=2000)):
780
+ self.logger.info('Found a captcha')
781
+ await self._recaptcha_solver(page)
782
+ except PlaywrightTimeoutError as e:
783
+ self.logger.info(f'Captcha on {url} is not ready: {e}')
784
+ except TargetClosedError as e:
785
+ self.logger.warning(f'Target closed while resolving captcha on {url}: {e}')
786
+ except Error as e:
787
+ self.logger.warning(f'Error while resolving captcha on {url}: {e}')
788
+ except Exception as e:
789
+ self.logger.exception(f'General error with captcha solving on {url}: {e}')
790
+ # ======
791
+ # NOTE: testing
792
+ # await self.__cloudflare_bypass_attempt(page)
793
+ self.logger.debug('Done with captcha.')
794
+
790
795
  # move mouse
791
796
  await page.mouse.move(x=random.uniform(300, 800), y=random.uniform(200, 500))
792
797
  self.logger.debug('Moved mouse.')
@@ -812,7 +817,7 @@ class Capture():
812
817
  try:
813
818
  await page.locator(f'id={fragment}').first.scroll_into_view_if_needed(timeout=3000)
814
819
  await self._wait_for_random_timeout(page, 2)
815
- async with timeout(3):
820
+ async with timeout(5):
816
821
  await page.mouse.wheel(delta_y=random.uniform(150, 300), delta_x=0)
817
822
  self.logger.debug('Jumped to fragment.')
818
823
  except PlaywrightTimeoutError as e:
@@ -821,20 +826,24 @@ class Capture():
821
826
  self.logger.warning(f'Target closed, unable to go to fragment "{fragment}": {e}')
822
827
  except Error as e:
823
828
  self.logger.exception(f'Unable to go to fragment "{fragment}": {e}')
824
- except TimeoutError:
829
+ except (asyncio.TimeoutError, TimeoutError):
825
830
  self.logger.debug('Unable to scroll due to timeout')
831
+ except (asyncio.CancelledError):
832
+ self.logger.debug('Unable to scroll due to timeout, call canceled')
826
833
  else:
827
834
  # scroll more
828
835
  try:
829
836
  # NOTE using page.mouse.wheel causes the instrumentation to fail, sometimes.
830
837
  # 2024-07-08: Also, it sometimes gets stuck.
831
- async with timeout(3):
838
+ async with timeout(5):
832
839
  await page.mouse.wheel(delta_y=random.uniform(1500, 3000), delta_x=0)
833
840
  self.logger.debug('Scrolled down.')
834
841
  except Error as e:
835
842
  self.logger.debug(f'Unable to scroll: {e}')
836
- except TimeoutError:
843
+ except (TimeoutError, asyncio.TimeoutError):
837
844
  self.logger.debug('Unable to scroll due to timeout')
845
+ except (asyncio.CancelledError):
846
+ self.logger.debug('Unable to scroll due to timeout, call canceled')
838
847
 
839
848
  await self._wait_for_random_timeout(page, 3)
840
849
  self.logger.debug('Keep going after moving on page.')
@@ -866,8 +875,12 @@ class Capture():
866
875
  to_return["downloaded_file"] = mem_zip.getvalue()
867
876
 
868
877
  # fast forward 30s
869
- await page.clock.run_for("30")
870
- self.logger.debug('Moved time forward.')
878
+ # try:
879
+ # async with timeout(3):
880
+ # await page.clock.run_for("47")
881
+ # self.logger.debug('Moved time forward.')
882
+ # except (TimeoutError, asyncio.TimeoutError):
883
+ # self.logger.warning('Unable to move time forward.')
871
884
 
872
885
  self.logger.debug('Done with instrumentation, waiting for network idle.')
873
886
  await self._wait_for_random_timeout(page, 5) # Wait 5 sec after instrumentation
@@ -915,7 +928,7 @@ class Capture():
915
928
  rendered_hostname_only=rendered_hostname_only,
916
929
  max_depth_capture_time=max_capture_time)
917
930
  to_return['children'].append(child_capture) # type: ignore[union-attr]
918
- except (TimeoutError, asyncio.exceptions.TimeoutError, asyncio.TimeoutError):
931
+ except (TimeoutError, asyncio.TimeoutError):
919
932
  self.logger.info(f'Timeout error, took more than {max_capture_time}s. Unable to capture {url}.')
920
933
  consecutive_errors += 1
921
934
  except Exception as e:
@@ -943,6 +956,9 @@ class Capture():
943
956
  except PlaywrightTimeoutError as e:
944
957
  to_return['error'] = f"The capture took too long - {e.message}"
945
958
  self.should_retry = True
959
+ except (asyncio.TimeoutError, TimeoutError):
960
+ to_return['error'] = "Something in the capture took too long"
961
+ self.should_retry = True
946
962
  except TargetClosedError as e:
947
963
  to_return['error'] = f"The target was closed - {e}"
948
964
  self.should_retry = True
@@ -1078,8 +1094,9 @@ class Capture():
1078
1094
  tries = 3
1079
1095
  while tries:
1080
1096
  try:
1081
- return await page.content()
1082
- except Error:
1097
+ async with timeout(30):
1098
+ return await page.content()
1099
+ except (Error, TimeoutError):
1083
1100
  self.logger.debug('Unable to get page content, trying again.')
1084
1101
  tries -= 1
1085
1102
  await self._wait_for_random_timeout(page, 1)
@@ -1225,6 +1242,11 @@ class Capture():
1225
1242
  if ': ' in name:
1226
1243
  _, name = name.split(': ', maxsplit=1)
1227
1244
  exception._name = name.strip()
1245
+ else:
1246
+ # The format changed in Playwright 1.43.0: the name of the method that failed is set before the exception itself.
1247
+ if ': ' in exception.message:
1248
+ _, name = exception.message.split(': ', maxsplit=1)
1249
+ exception._name = name.strip()
1228
1250
 
1229
1251
  def _exception_is_network_error(self, exception: Error) -> bool:
1230
1252
  if exception.name in [
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: PlaywrightCapture
3
- Version: 1.25.3
3
+ Version: 1.25.5
4
4
  Summary: A simple library to capture websites using playwright
5
5
  Home-page: https://github.com/Lookyloo/PlaywrightCapture
6
6
  License: BSD-3-Clause
@@ -1,9 +1,9 @@
1
1
  playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
2
- playwrightcapture/capture.py,sha256=6OzlAFUE7fzywSfd5E6d_BvKMN0gi7vNNXsp0SBbyak,69419
2
+ playwrightcapture/capture.py,sha256=rPzmotG1_o-SS0I6oJbMPnVoVweSS_BQ-zUdDaInEsg,70897
3
3
  playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
4
4
  playwrightcapture/helpers.py,sha256=SXQLEuxMs8-bcWykMiUVosHzzxBKuS-QC0gBV3OmKmo,1764
5
5
  playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- playwrightcapture-1.25.3.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
7
- playwrightcapture-1.25.3.dist-info/METADATA,sha256=LjyYrFwB8WzSAldFOmKeSuci_wdI8DDd45jHn8B-MyY,3173
8
- playwrightcapture-1.25.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
9
- playwrightcapture-1.25.3.dist-info/RECORD,,
6
+ playwrightcapture-1.25.5.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
7
+ playwrightcapture-1.25.5.dist-info/METADATA,sha256=KfTBTvcCfRoKhAZVGUXjoYfXODyrBFc0iWjTZeWwZ-I,3173
8
+ playwrightcapture-1.25.5.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
9
+ playwrightcapture-1.25.5.dist-info/RECORD,,