PlaywrightCapture 1.25.0__py3-none-any.whl → 1.25.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,10 +13,11 @@ import sys
13
13
  import time
14
14
 
15
15
  from base64 import b64decode
16
+ from dataclasses import dataclass
16
17
  from io import BytesIO
17
18
  from logging import LoggerAdapter, Logger
18
19
  from tempfile import NamedTemporaryFile
19
- from typing import Any, TypedDict, Literal, TYPE_CHECKING, MutableMapping
20
+ from typing import Any, TypedDict, Literal, TYPE_CHECKING, MutableMapping, Generator
20
21
  from urllib.parse import urlparse, unquote, urljoin
21
22
  from zipfile import ZipFile
22
23
 
@@ -30,7 +31,7 @@ from charset_normalizer import from_bytes
30
31
  from playwright._impl._errors import TargetClosedError
31
32
  from playwright.async_api import async_playwright, Frame, Error, Page, Download, Request
32
33
  from playwright.async_api import TimeoutError as PlaywrightTimeoutError
33
- from playwright_stealth import stealth_async # type: ignore[import-untyped]
34
+ from playwright_stealth import stealth_async, StealthConfig # type: ignore[import-untyped]
34
35
  from puremagic import PureError, from_string # type: ignore[import-untyped]
35
36
  from w3lib.html import strip_html5_whitespace
36
37
  from w3lib.url import canonicalize_url, safe_url_string
@@ -95,6 +96,31 @@ class PlaywrightCaptureLogAdapter(LoggerAdapter): # type: ignore[type-arg]
95
96
  return msg, kwargs
96
97
 
97
98
 
99
+ @dataclass
100
+ class PCStealthConfig(StealthConfig): # type: ignore[misc]
101
+
102
+ @property
103
+ def enabled_scripts(self) -> Generator[str, None, None]:
104
+ self.webdriver = True
105
+ self.webgl_vendor = True
106
+ self.chrome_app = True
107
+ self.chrome_csi = True
108
+ self.chrome_load_times = True
109
+ self.chrome_runtime = True
110
+ self.iframe_content_window = True
111
+ self.media_codecs = True
112
+ self.navigator_hardware_concurrency = 4
113
+ self.navigator_languages = False # Causes issues
114
+ self.navigator_permissions = True
115
+ self.navigator_platform = True
116
+ self.navigator_plugins = True
117
+ self.navigator_user_agent = False # Causes issues
118
+ self.navigator_vendor = False # Causes issues
119
+ self.outerdimensions = True
120
+ self.hairline = True
121
+ yield from super().enabled_scripts
122
+
123
+
98
124
  class Capture():
99
125
 
100
126
  _browsers: list[BROWSER] = ['chromium', 'firefox', 'webkit']
@@ -171,7 +197,8 @@ class Capture():
171
197
  raise UnknownPlaywrightBrowser(f'Incorrect browser name {self.browser_name}, must be in {", ".join(self._browsers)}')
172
198
 
173
199
  self.browser = await self.playwright[self.browser_name].launch(
174
- proxy=self.proxy if self.proxy else None
200
+ proxy=self.proxy if self.proxy else None,
201
+ # headless=False
175
202
  )
176
203
 
177
204
  # Set of URLs that were captured in that context
@@ -385,6 +412,7 @@ class Capture():
385
412
  self.context = await self.browser.new_context(
386
413
  record_har_path=self._temp_harfile.name,
387
414
  ignore_https_errors=True,
415
+ bypass_csp=True,
388
416
  http_credentials=self.http_credentials if self.http_credentials else None,
389
417
  user_agent=self.user_agent if self.user_agent else device_context_settings.pop('user_agent', None),
390
418
  locale=self.locale if self.locale else None,
@@ -472,7 +500,7 @@ class Capture():
472
500
  async def handler() -> None:
473
501
  self.logger.debug('Didomi dialog found, clicking through.')
474
502
  if await page.locator("#didomi-notice-agree-button").is_visible():
475
- await page.locator("#didomi-notice-agree-button").click(timeout=2000)
503
+ await page.locator("#didomi-notice-agree-button").click(timeout=30000)
476
504
 
477
505
  await page.add_locator_handler(page.locator(".didomi-popup-view").last, handler, times=1, no_wait_after=True)
478
506
  self.logger.info('Didomi handler added')
@@ -654,6 +682,7 @@ class Capture():
654
682
  capturing_sub = False
655
683
  try:
656
684
  page = await self.context.new_page()
685
+ await page.clock.install()
657
686
  except Error as e:
658
687
  self.logger.warning(f'The context is in a broken state: {e}')
659
688
  self.should_retry = True
@@ -671,7 +700,9 @@ class Capture():
671
700
  await self.__dialog_alert_dialog_clickthrough(page)
672
701
  await self.__dialog_clickthrough(page)
673
702
 
674
- await stealth_async(page)
703
+ await stealth_async(page, PCStealthConfig())
704
+ # await stealth_async(page)
705
+
675
706
  page.set_default_timeout((self._capture_timeout - 2) * 1000)
676
707
  # trigger a callback on each request to store it in a dict indexed by URL to get it back from the favicon fetcher
677
708
  page.on("requestfinished", store_request)
@@ -718,6 +749,7 @@ class Capture():
718
749
  else:
719
750
  raise initial_error
720
751
  else:
752
+ await self._wait_for_random_timeout(page, 5) # Wait 5 sec after document loaded
721
753
  try:
722
754
  await page.bring_to_front()
723
755
  self.logger.debug('Page moved to front.')
@@ -753,28 +785,34 @@ class Capture():
753
785
  # check if we have anything on the page. If we don't, the page is not working properly.
754
786
  if await self._failsafe_get_content(page):
755
787
  self.logger.debug('Got rendered content')
788
+
789
+ # move mouse
790
+ await page.mouse.move(x=random.uniform(300, 800), y=random.uniform(200, 500))
791
+ self.logger.debug('Moved mouse.')
792
+ await self._wait_for_random_timeout(page, 5)
793
+ self.logger.debug('Keep going after moving mouse.')
794
+
756
795
  if allow_tracking:
757
- await self._wait_for_random_timeout(page, 2)
796
+ await self._wait_for_random_timeout(page, 5)
758
797
  # This event is required to trigger the add_locator_handler
759
798
  try:
760
799
  if await page.locator("body").first.is_visible():
761
- await page.locator("body").first.click(button="right", timeout=5000)
800
+ self.logger.debug('Got body.')
801
+ await page.locator("body").first.click(button="right",
802
+ timeout=5000,
803
+ delay=50)
804
+ self.logger.debug('Clicked on body.')
762
805
  except Exception as e:
763
806
  self.logger.warning(f'Could not find body: {e}')
764
807
 
765
- # move mouse
766
- await page.mouse.move(x=random.uniform(300, 800), y=random.uniform(200, 500))
767
- self.logger.debug('Moved mouse.')
768
- await self._wait_for_random_timeout(page, 2)
769
- self.logger.debug('Keep going after moving mouse.')
770
-
771
808
  if parsed_url.fragment:
772
809
  # We got a fragment, make sure we go to it and scroll only a little bit.
773
810
  fragment = unquote(parsed_url.fragment)
774
811
  try:
775
812
  await page.locator(f'id={fragment}').first.scroll_into_view_if_needed(timeout=3000)
776
813
  await self._wait_for_random_timeout(page, 2)
777
- await page.mouse.wheel(delta_y=random.uniform(150, 300), delta_x=0)
814
+ async with timeout(3):
815
+ await page.mouse.wheel(delta_y=random.uniform(150, 300), delta_x=0)
778
816
  self.logger.debug('Jumped to fragment.')
779
817
  except PlaywrightTimeoutError as e:
780
818
  self.logger.info(f'Unable to go to fragment "{fragment}" (timeout): {e}')
@@ -782,14 +820,20 @@ class Capture():
782
820
  self.logger.warning(f'Target closed, unable to go to fragment "{fragment}": {e}')
783
821
  except Error as e:
784
822
  self.logger.exception(f'Unable to go to fragment "{fragment}": {e}')
823
+ except TimeoutError:
824
+ self.logger.debug('Unable to scroll due to timeout')
785
825
  else:
786
826
  # scroll more
787
827
  try:
788
- # NOTE using page.mouse.wheel causes the instrumentation to fail, sometimes
789
- await page.mouse.wheel(delta_y=random.uniform(1500, 3000), delta_x=0)
828
+ # NOTE using page.mouse.wheel causes the instrumentation to fail, sometimes.
829
+ # 2024-07-08: Also, it sometimes gets stuck.
830
+ async with timeout(3):
831
+ await page.mouse.wheel(delta_y=random.uniform(1500, 3000), delta_x=0)
790
832
  self.logger.debug('Scrolled down.')
791
833
  except Error as e:
792
834
  self.logger.debug(f'Unable to scroll: {e}')
835
+ except TimeoutError:
836
+ self.logger.debug('Unable to scroll due to timeout')
793
837
 
794
838
  await self._wait_for_random_timeout(page, 3)
795
839
  self.logger.debug('Keep going after moving on page.')
@@ -802,7 +846,6 @@ class Capture():
802
846
  self.logger.debug('PageDown on keyboard')
803
847
  except Error as e:
804
848
  self.logger.debug(f'Unable to use keyboard: {e}')
805
-
806
849
  if self.wait_for_download > 0:
807
850
  self.logger.info('Waiting for download to finish...')
808
851
  await self._safe_wait(page, 20)
@@ -821,6 +864,10 @@ class Capture():
821
864
  z.writestr(f'{i}_{filename}', file_content)
822
865
  to_return["downloaded_file"] = mem_zip.getvalue()
823
866
 
867
+ # fast forward 30s
868
+ await page.clock.run_for("30")
869
+ self.logger.debug('Moved time forward.')
870
+
824
871
  self.logger.debug('Done with instrumentation, waiting for network idle.')
825
872
  await self._wait_for_random_timeout(page, 5) # Wait 5 sec after instrumentation
826
873
  await self._safe_wait(page)
@@ -993,12 +1040,12 @@ class Capture():
993
1040
  async def _failsafe_get_screenshot(self, page: Page) -> bytes:
994
1041
  self.logger.debug("Capturing a screenshot of the full page.")
995
1042
  try:
996
- return await page.screenshot(full_page=True, timeout=5000)
1043
+ return await page.screenshot(full_page=True, timeout=10000)
997
1044
  except Error as e:
998
1045
  self.logger.info(f"Capturing a screenshot of the full page failed, trying to scale it down: {e}")
999
1046
 
1000
1047
  try:
1001
- return await page.screenshot(full_page=True, scale="css", timeout=5000)
1048
+ return await page.screenshot(full_page=True, scale="css", timeout=30000)
1002
1049
  except Error as e:
1003
1050
  self.logger.info(f"Capturing a screenshot of the full page failed, trying to get the current viewport only: {e}")
1004
1051
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: PlaywrightCapture
3
- Version: 1.25.0
3
+ Version: 1.25.2
4
4
  Summary: A simple library to capture websites using playwright
5
5
  Home-page: https://github.com/Lookyloo/PlaywrightCapture
6
6
  License: BSD-3-Clause
@@ -31,7 +31,7 @@ Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
31
31
  Requires-Dist: puremagic (>=1.25,<2.0)
32
32
  Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
33
33
  Requires-Dist: pytz (>=2024.1,<2025.0) ; python_version < "3.9"
34
- Requires-Dist: setuptools (>=70.2.0,<71.0.0)
34
+ Requires-Dist: setuptools (>=70.3.0,<71.0.0)
35
35
  Requires-Dist: tzdata (>=2024.1,<2025.0)
36
36
  Requires-Dist: w3lib (>=2.2.1,<3.0.0)
37
37
  Project-URL: Repository, https://github.com/Lookyloo/PlaywrightCapture
@@ -1,9 +1,9 @@
1
1
  playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
2
- playwrightcapture/capture.py,sha256=zzoZQItpKDbxpfF0PqFANfeWTmQlSwnvChuz_l1Ah-I,67333
2
+ playwrightcapture/capture.py,sha256=Yv6c7FpsK2EqHBJUFE51oy55uGDkKsirJ2CFFx7xs2g,69385
3
3
  playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
4
4
  playwrightcapture/helpers.py,sha256=SXQLEuxMs8-bcWykMiUVosHzzxBKuS-QC0gBV3OmKmo,1764
5
5
  playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- playwrightcapture-1.25.0.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
7
- playwrightcapture-1.25.0.dist-info/METADATA,sha256=XBYGqQxi3Qvc-ktd1lLGFBfSKRmCLkH5UkbzNPeL8kA,3173
8
- playwrightcapture-1.25.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
9
- playwrightcapture-1.25.0.dist-info/RECORD,,
6
+ playwrightcapture-1.25.2.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
7
+ playwrightcapture-1.25.2.dist-info/METADATA,sha256=rme0VEHC3FEhh8kLA4ITWx4SWxekk1zHbqcGmods1_E,3173
8
+ playwrightcapture-1.25.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
9
+ playwrightcapture-1.25.2.dist-info/RECORD,,