PlaywrightCapture 1.27.7__py3-none-any.whl → 1.27.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -138,8 +138,8 @@ class Capture():
138
138
 
139
139
  def __init__(self, browser: BROWSER | None=None, device_name: str | None=None,
140
140
  proxy: str | dict[str, str] | None=None,
141
- general_timeout_in_sec: int | None = None, loglevel: str | int='INFO',
142
- uuid: str | None=None):
141
+ general_timeout_in_sec: int | None=None, loglevel: str | int='INFO',
142
+ uuid: str | None=None, headless: bool=True):
143
143
  """Captures a page with Playwright.
144
144
 
145
145
  :param browser: The browser to use for the capture.
@@ -148,6 +148,7 @@ class Capture():
148
148
  :param general_timeout_in_sec: The general timeout for the capture, including children.
149
149
  :param loglevel: Python loglevel
150
150
  :param uuid: The UUID of the capture.
151
+ :param headless: Whether to run the browser in headless mode. WARNING: setting this to False (headed mode) requires a graphical environment.
151
152
  """
152
153
  master_logger = logging.getLogger('playwrightcapture')
153
154
  master_logger.setLevel(loglevel)
@@ -167,6 +168,7 @@ class Capture():
167
168
  self._capture_timeout = self._minimal_timeout
168
169
 
169
170
  self.device_name: str | None = device_name
171
+ self.headless: bool = headless
170
172
  self.proxy: ProxySettings = {}
171
173
  if proxy:
172
174
  if isinstance(proxy, str):
@@ -224,7 +226,7 @@ class Capture():
224
226
  self.browser = await self.playwright[self.browser_name].launch(
225
227
  proxy=self.proxy if self.proxy else None,
226
228
  channel="chromium" if self.browser_name == "chromium" else None,
227
- # headless=False
229
+ headless=self.headless
228
230
  )
229
231
 
230
232
  # Set of URLs that were captured in that context
@@ -755,6 +757,144 @@ class Capture():
755
757
  self.logger.debug(f'Moved time forward by ~{time}s.')
756
758
  except (TimeoutError, asyncio.TimeoutError):
757
759
  self.logger.info('Unable to move time forward.')
760
+ except Exception as e:
761
+ self.logger.info(f'Error while moving time forward: {e}')
762
+
763
+ async def __instrumentation(self, page: Page, url: str, allow_tracking: bool, clock_set: bool) -> None:
764
+ # page instrumentation
765
+ await self._wait_for_random_timeout(page, 5) # Wait 5 sec after document loaded
766
+ self.logger.debug('Start instrumentation.')
767
+
768
+ # check if we have anything on the page. If we don't, the page is not working properly.
769
+ if await self._failsafe_get_content(page):
770
+ self.logger.debug('Got rendered content')
771
+
772
+ # ==== recaptcha
773
+ # Same technique as: https://github.com/NikolaiT/uncaptcha3
774
+ if CAN_SOLVE_CAPTCHA:
775
+ try:
776
+ if (await page.locator("//iframe[@title='reCAPTCHA']").first.is_visible(timeout=3000)
777
+ and await page.locator("//iframe[@title='reCAPTCHA']").first.is_enabled(timeout=2000)):
778
+ self.logger.info('Found a captcha')
779
+ await self._recaptcha_solver(page)
780
+ except PlaywrightTimeoutError as e:
781
+ self.logger.info(f'Captcha on {url} is not ready: {e}')
782
+ except TargetClosedError as e:
783
+ self.logger.warning(f'Target closed while resolving captcha on {url}: {e}')
784
+ except Error as e:
785
+ self.logger.warning(f'Error while resolving captcha on {url}: {e}')
786
+ except (TimeoutError, asyncio.TimeoutError) as e:
787
+ self.logger.warning(f'[Timeout] Error while resolving captcha on {url}: {e}')
788
+ except Exception as e:
789
+ self.logger.exception(f'General error with captcha solving on {url}: {e}')
790
+ # ======
791
+ # NOTE: testing
792
+ # await self.__cloudflare_bypass_attempt(page)
793
+ self.logger.debug('Done with captcha.')
794
+
795
+ # move mouse
796
+ try:
797
+ async with timeout(5):
798
+ await page.mouse.move(x=random.uniform(300, 800), y=random.uniform(200, 500))
799
+ self.logger.debug('Moved mouse.')
800
+ except (asyncio.TimeoutError, TimeoutError):
801
+ self.logger.debug('Moving the mouse caused a timeout.')
802
+
803
+ await self._wait_for_random_timeout(page, 5)
804
+ self.logger.debug('Keep going after moving mouse.')
805
+
806
+ if allow_tracking:
807
+ await self._wait_for_random_timeout(page, 5)
808
+ # This event is required to trigger the add_locator_handler
809
+ try:
810
+ if await page.locator("body").first.is_visible():
811
+ self.logger.debug('Got body.')
812
+ await page.locator("body").first.click(button="right",
813
+ timeout=5000,
814
+ delay=50)
815
+ self.logger.debug('Clicked on body.')
816
+ except Exception as e:
817
+ self.logger.warning(f'Could not find body: {e}')
818
+
819
+ await self._wait_for_random_timeout(page, 5)
820
+ # triggering clicks on very generic frames is sometimes impossible, using button and common language.
821
+ self.logger.debug('Check other frames for button')
822
+ for frame in page.frames:
823
+ if await self.__frame_consent(frame):
824
+ await self._wait_for_random_timeout(page, 10) # Wait 10 sec after click
825
+ self.logger.debug('Done with frames.')
826
+
827
+ self.logger.debug('Check main frame for button')
828
+ if await self.__frame_consent(page.main_frame):
829
+ self.logger.debug('Got button on main frame')
830
+ await self._wait_for_random_timeout(page, 10) # Wait 10 sec after click
831
+
832
+ if clock_set:
833
+ await self._move_time_forward(page, 10)
834
+
835
+ # Parse the URL. If there is a fragment, we need to scroll to it manually
836
+ parsed_url = urlparse(url, allow_fragments=True)
837
+
838
+ if parsed_url.fragment:
839
+ # We got a fragment, make sure we go to it and scroll only a little bit.
840
+ fragment = unquote(parsed_url.fragment)
841
+ try:
842
+ await page.locator(f'id={fragment}').first.scroll_into_view_if_needed(timeout=3000)
843
+ await self._wait_for_random_timeout(page, 2)
844
+ async with timeout(5):
845
+ await page.mouse.wheel(delta_y=random.uniform(150, 300), delta_x=0)
846
+ self.logger.debug('Jumped to fragment.')
847
+ except PlaywrightTimeoutError as e:
848
+ self.logger.info(f'Unable to go to fragment "{fragment}" (timeout): {e}')
849
+ except TargetClosedError as e:
850
+ self.logger.warning(f'Target closed, unable to go to fragment "{fragment}": {e}')
851
+ except Error as e:
852
+ self.logger.exception(f'Unable to go to fragment "{fragment}": {e}')
853
+ except (asyncio.TimeoutError, TimeoutError):
854
+ self.logger.debug('Unable to scroll due to timeout')
855
+ except (asyncio.CancelledError):
856
+ self.logger.debug('Unable to scroll due to timeout, call canceled')
857
+ else:
858
+ # scroll more
859
+ try:
860
+ # NOTE using page.mouse.wheel causes the instrumentation to fail, sometimes.
861
+ # 2024-07-08: Also, it sometimes gets stuck.
862
+ async with timeout(5):
863
+ await page.mouse.wheel(delta_y=random.uniform(1500, 3000), delta_x=0)
864
+ self.logger.debug('Scrolled down.')
865
+ except Error as e:
866
+ self.logger.debug(f'Unable to scroll: {e}')
867
+ except (TimeoutError, asyncio.TimeoutError):
868
+ self.logger.debug('Unable to scroll due to timeout')
869
+ except (asyncio.CancelledError):
870
+ self.logger.debug('Unable to scroll due to timeout, call canceled')
871
+
872
+ await self._wait_for_random_timeout(page, 3)
873
+ self.logger.debug('Keep going after moving on page.')
874
+
875
+ try:
876
+ async with timeout(5):
877
+ await page.keyboard.press('PageUp')
878
+ self.logger.debug('PageUp on keyboard')
879
+ await self._wait_for_random_timeout(page, 3)
880
+ await page.keyboard.press('PageDown')
881
+ self.logger.debug('PageDown on keyboard')
882
+ except (asyncio.TimeoutError, TimeoutError):
883
+ self.logger.debug('Using keyboard caused a timeout.')
884
+ except Error as e:
885
+ self.logger.debug(f'Unable to use keyboard: {e}')
886
+ if self.wait_for_download > 0:
887
+ self.logger.info('Waiting for download to finish...')
888
+ await self._safe_wait(page, 20)
889
+
890
+ if clock_set:
891
+ # fast forward ~30s
892
+ await self._move_time_forward(page, 30)
893
+
894
+ self.logger.debug('Done with instrumentation, waiting for network idle.')
895
+ await self._wait_for_random_timeout(page, 5) # Wait 5 sec after instrumentation
896
+ await self._safe_wait(page)
897
+ self.logger.debug('Done with instrumentation, done with waiting.')
758
898
 
759
899
  async def capture_page(self, url: str, *, max_depth_capture_time: int,
760
900
  referer: str | None=None,
@@ -858,9 +998,6 @@ class Capture():
858
998
  page.on("dialog", lambda dialog: dialog.accept())
859
999
 
860
1000
  try:
861
- # Parse the URL. If there is a fragment, we need to scroll to it manually
862
- parsed_url = urlparse(url, allow_fragments=True)
863
-
864
1001
  try:
865
1002
  await page.goto(url, wait_until='domcontentloaded', referer=referer if referer else '')
866
1003
  page.on("download", handle_download)
@@ -906,128 +1043,14 @@ class Capture():
906
1043
  except Error as e:
907
1044
  self.logger.warning(f'Unable to bring the page to the front: {e}.')
908
1045
 
909
- # page instrumentation
910
- await self._wait_for_random_timeout(page, 5) # Wait 5 sec after document loaded
911
- self.logger.debug('Start instrumentation.')
912
-
913
- # check if we have anything on the page. If we don't, the page is not working properly.
914
- if await self._failsafe_get_content(page):
915
- self.logger.debug('Got rendered content')
916
-
917
- # ==== recaptcha
918
- # Same technique as: https://github.com/NikolaiT/uncaptcha3
919
- if CAN_SOLVE_CAPTCHA:
920
- try:
921
- if (await page.locator("//iframe[@title='reCAPTCHA']").first.is_visible(timeout=3000)
922
- and await page.locator("//iframe[@title='reCAPTCHA']").first.is_enabled(timeout=2000)):
923
- self.logger.info('Found a captcha')
924
- await self._recaptcha_solver(page)
925
- except PlaywrightTimeoutError as e:
926
- self.logger.info(f'Captcha on {url} is not ready: {e}')
927
- except TargetClosedError as e:
928
- self.logger.warning(f'Target closed while resolving captcha on {url}: {e}')
929
- except Error as e:
930
- self.logger.warning(f'Error while resolving captcha on {url}: {e}')
931
- except (TimeoutError, asyncio.TimeoutError) as e:
932
- self.logger.warning(f'[Timeout] Error while resolving captcha on {url}: {e}')
933
- except Exception as e:
934
- self.logger.exception(f'General error with captcha solving on {url}: {e}')
935
- # ======
936
- # NOTE: testing
937
- # await self.__cloudflare_bypass_attempt(page)
938
- self.logger.debug('Done with captcha.')
939
-
940
- # move mouse
941
- try:
942
- async with timeout(5):
943
- await page.mouse.move(x=random.uniform(300, 800), y=random.uniform(200, 500))
944
- self.logger.debug('Moved mouse.')
945
- except (asyncio.TimeoutError, TimeoutError):
946
- self.logger.debug('Moving the mouse caused a timeout.')
947
-
948
- await self._wait_for_random_timeout(page, 5)
949
- self.logger.debug('Keep going after moving mouse.')
950
-
951
- if allow_tracking:
952
- await self._wait_for_random_timeout(page, 5)
953
- # This event is required trigger the add_locator_handler
954
- try:
955
- if await page.locator("body").first.is_visible():
956
- self.logger.debug('Got body.')
957
- await page.locator("body").first.click(button="right",
958
- timeout=5000,
959
- delay=50)
960
- self.logger.debug('Clicked on body.')
961
- except Exception as e:
962
- self.logger.warning(f'Could not find body: {e}')
963
-
964
- await self._wait_for_random_timeout(page, 5)
965
- # triggering clicks on very generic frames is sometimes impossible, using button and common language.
966
- self.logger.debug('Check other frames for button')
967
- for frame in page.frames:
968
- if await self.__frame_consent(frame):
969
- await self._wait_for_random_timeout(page, 10) # Wait 10 sec after click
970
- self.logger.debug('Done with frames.')
971
-
972
- self.logger.debug('Check main frame for button')
973
- if await self.__frame_consent(page.main_frame):
974
- self.logger.debug('Got button on main frame')
975
- await self._wait_for_random_timeout(page, 10) # Wait 10 sec after click
976
-
977
- if clock_set:
978
- await self._move_time_forward(page, 10)
979
-
980
- if parsed_url.fragment:
981
- # We got a fragment, make sure we go to it and scroll only a little bit.
982
- fragment = unquote(parsed_url.fragment)
983
- try:
984
- await page.locator(f'id={fragment}').first.scroll_into_view_if_needed(timeout=3000)
985
- await self._wait_for_random_timeout(page, 2)
986
- async with timeout(5):
987
- await page.mouse.wheel(delta_y=random.uniform(150, 300), delta_x=0)
988
- self.logger.debug('Jumped to fragment.')
989
- except PlaywrightTimeoutError as e:
990
- self.logger.info(f'Unable to go to fragment "{fragment}" (timeout): {e}')
991
- except TargetClosedError as e:
992
- self.logger.warning(f'Target closed, unable to go to fragment "{fragment}": {e}')
993
- except Error as e:
994
- self.logger.exception(f'Unable to go to fragment "{fragment}": {e}')
995
- except (asyncio.TimeoutError, TimeoutError):
996
- self.logger.debug('Unable to scroll due to timeout')
997
- except (asyncio.CancelledError):
998
- self.logger.debug('Unable to scroll due to timeout, call canceled')
1046
+ try:
1047
+ if self.headless:
1048
+ await self.__instrumentation(page, url, allow_tracking, clock_set)
999
1049
  else:
1000
- # scroll more
1001
- try:
1002
- # NOTE using page.mouse.wheel causes the instrumentation to fail, sometimes.
1003
- # 2024-07-08: Also, it sometimes get stuck.
1004
- async with timeout(5):
1005
- await page.mouse.wheel(delta_y=random.uniform(1500, 3000), delta_x=0)
1006
- self.logger.debug('Scrolled down.')
1007
- except Error as e:
1008
- self.logger.debug(f'Unable to scroll: {e}')
1009
- except (TimeoutError, asyncio.TimeoutError):
1010
- self.logger.debug('Unable to scroll due to timeout')
1011
- except (asyncio.CancelledError):
1012
- self.logger.debug('Unable to scroll due to timeout, call canceled')
1013
-
1014
- await self._wait_for_random_timeout(page, 3)
1015
- self.logger.debug('Keep going after moving on page.')
1016
-
1017
- try:
1018
- async with timeout(5):
1019
- await page.keyboard.press('PageUp')
1020
- self.logger.debug('PageUp on keyboard')
1021
- await self._wait_for_random_timeout(page, 3)
1022
- await page.keyboard.press('PageDown')
1023
- self.logger.debug('PageDown on keyboard')
1024
- except (asyncio.TimeoutError, TimeoutError):
1025
- self.logger.debug('Using keyboard caused a timeout.')
1026
- except Error as e:
1027
- self.logger.debug(f'Unable to use keyboard: {e}')
1028
- if self.wait_for_download > 0:
1029
- self.logger.info('Waiting for download to finish...')
1030
- await self._safe_wait(page, 20)
1050
+ self.logger.debug('Headed mode, skipping instrumentation.')
1051
+ await self._wait_for_random_timeout(page, self._capture_timeout - 5)
1052
+ except Exception as e:
1053
+ self.logger.exception(f'Error during instrumentation: {e}')
1031
1054
 
1032
1055
  if multiple_downloads:
1033
1056
  if len(multiple_downloads) == 1:
@@ -1043,16 +1066,6 @@ class Capture():
1043
1066
  z.writestr(f'{i}_{filename}', file_content)
1044
1067
  to_return["downloaded_file"] = mem_zip.getvalue()
1045
1068
 
1046
- if clock_set:
1047
- # fast forward ~30s
1048
- await self._move_time_forward(page, 30)
1049
-
1050
- self.logger.debug('Done with instrumentation, waiting for network idle.')
1051
- await self._wait_for_random_timeout(page, 5) # Wait 5 sec after instrumentation
1052
- await self._safe_wait(page)
1053
-
1054
- self.logger.debug('Done with instrumentation, done with waiting.')
1055
-
1056
1069
  if content := await self._failsafe_get_content(page):
1057
1070
  to_return['html'] = content
1058
1071
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: PlaywrightCapture
3
- Version: 1.27.7
3
+ Version: 1.27.9
4
4
  Summary: A simple library to capture websites using playwright
5
5
  License: BSD-3-Clause
6
6
  Author: Raphaël Vinot
@@ -20,10 +20,10 @@ Classifier: Topic :: Security
20
20
  Provides-Extra: recaptcha
21
21
  Requires-Dist: SpeechRecognition (>=3.14.1) ; extra == "recaptcha"
22
22
  Requires-Dist: aiohttp-socks (>=0.10.1)
23
- Requires-Dist: aiohttp[speedups] (>=3.11.11)
23
+ Requires-Dist: aiohttp[speedups] (>=3.11.12)
24
24
  Requires-Dist: async-timeout (>=5.0.1) ; python_version < "3.11"
25
- Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.13.1)
26
- Requires-Dist: dateparser (>=1.2.0)
25
+ Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.13.3)
26
+ Requires-Dist: dateparser (>=1.2.1)
27
27
  Requires-Dist: playwright (>=1.50.0)
28
28
  Requires-Dist: playwright-stealth (>=1.0.6)
29
29
  Requires-Dist: puremagic (>=1.28)
@@ -0,0 +1,9 @@
1
+ playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
2
+ playwrightcapture/capture.py,sha256=0jYIAx5EmdZKGfGaeGhGorc9wIAzDHhgBaBjMkuV5kI,80780
3
+ playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
4
+ playwrightcapture/helpers.py,sha256=SXQLEuxMs8-bcWykMiUVosHzzxBKuS-QC0gBV3OmKmo,1764
5
+ playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ playwrightcapture-1.27.9.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
7
+ playwrightcapture-1.27.9.dist-info/METADATA,sha256=H5GDI-738jL1XkZi8bAWegZu3Pcx6tRL8Tdhv8IP-20,2998
8
+ playwrightcapture-1.27.9.dist-info/WHEEL,sha256=7dDg4QLnNKTvwIDR9Ac8jJaAmBC_owJrckbC0jjThyA,88
9
+ playwrightcapture-1.27.9.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: poetry-core 2.0.1
2
+ Generator: poetry-core 2.1.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
@@ -1,9 +0,0 @@
1
- playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
2
- playwrightcapture/capture.py,sha256=7Dzb909bGdLD5vxdx2U-nD5Pikf6gWbpJL5eIfG7734,80856
3
- playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
4
- playwrightcapture/helpers.py,sha256=SXQLEuxMs8-bcWykMiUVosHzzxBKuS-QC0gBV3OmKmo,1764
5
- playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- playwrightcapture-1.27.7.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
7
- playwrightcapture-1.27.7.dist-info/METADATA,sha256=yhTawjHSSyKK0XqUUC6j-MdrTcnWZbPtKxhOb-EOX54,2998
8
- playwrightcapture-1.27.7.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
9
- playwrightcapture-1.27.7.dist-info/RECORD,,