PlaywrightCapture 1.25.0__tar.gz → 1.25.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: PlaywrightCapture
3
- Version: 1.25.0
3
+ Version: 1.25.1
4
4
  Summary: A simple library to capture websites using playwright
5
5
  Home-page: https://github.com/Lookyloo/PlaywrightCapture
6
6
  License: BSD-3-Clause
@@ -31,7 +31,7 @@ Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
31
31
  Requires-Dist: puremagic (>=1.25,<2.0)
32
32
  Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
33
33
  Requires-Dist: pytz (>=2024.1,<2025.0) ; python_version < "3.9"
34
- Requires-Dist: setuptools (>=70.2.0,<71.0.0)
34
+ Requires-Dist: setuptools (>=70.3.0,<71.0.0)
35
35
  Requires-Dist: tzdata (>=2024.1,<2025.0)
36
36
  Requires-Dist: w3lib (>=2.2.1,<3.0.0)
37
37
  Project-URL: Repository, https://github.com/Lookyloo/PlaywrightCapture
@@ -13,10 +13,11 @@ import sys
13
13
  import time
14
14
 
15
15
  from base64 import b64decode
16
+ from dataclasses import dataclass
16
17
  from io import BytesIO
17
18
  from logging import LoggerAdapter, Logger
18
19
  from tempfile import NamedTemporaryFile
19
- from typing import Any, TypedDict, Literal, TYPE_CHECKING, MutableMapping
20
+ from typing import Any, TypedDict, Literal, TYPE_CHECKING, MutableMapping, Generator
20
21
  from urllib.parse import urlparse, unquote, urljoin
21
22
  from zipfile import ZipFile
22
23
 
@@ -30,7 +31,7 @@ from charset_normalizer import from_bytes
30
31
  from playwright._impl._errors import TargetClosedError
31
32
  from playwright.async_api import async_playwright, Frame, Error, Page, Download, Request
32
33
  from playwright.async_api import TimeoutError as PlaywrightTimeoutError
33
- from playwright_stealth import stealth_async # type: ignore[import-untyped]
34
+ from playwright_stealth import stealth_async, StealthConfig # type: ignore[import-untyped]
34
35
  from puremagic import PureError, from_string # type: ignore[import-untyped]
35
36
  from w3lib.html import strip_html5_whitespace
36
37
  from w3lib.url import canonicalize_url, safe_url_string
@@ -95,6 +96,31 @@ class PlaywrightCaptureLogAdapter(LoggerAdapter): # type: ignore[type-arg]
95
96
  return msg, kwargs
96
97
 
97
98
 
99
+ @dataclass
100
+ class PCStealthConfig(StealthConfig): # type: ignore[misc]
101
+
102
+ @property
103
+ def enabled_scripts(self) -> Generator[str, None, None]:
104
+ self.webdriver = True
105
+ self.webgl_vendor = True
106
+ self.chrome_app = True
107
+ self.chrome_csi = True
108
+ self.chrome_load_times = True
109
+ self.chrome_runtime = True
110
+ self.iframe_content_window = True
111
+ self.media_codecs = True
112
+ self.navigator_hardware_concurrency = 4
113
+ self.navigator_languages = False # Causes issue
114
+ self.navigator_permissions = True
115
+ self.navigator_platform = True
116
+ self.navigator_plugins = True
117
+ self.navigator_user_agent = False # Causes issues
118
+ self.navigator_vendor = False # Causes issues
119
+ self.outerdimensions = True
120
+ self.hairline = True
121
+ yield from super().enabled_scripts
122
+
123
+
98
124
  class Capture():
99
125
 
100
126
  _browsers: list[BROWSER] = ['chromium', 'firefox', 'webkit']
@@ -171,7 +197,8 @@ class Capture():
171
197
  raise UnknownPlaywrightBrowser(f'Incorrect browser name {self.browser_name}, must be in {", ".join(self._browsers)}')
172
198
 
173
199
  self.browser = await self.playwright[self.browser_name].launch(
174
- proxy=self.proxy if self.proxy else None
200
+ proxy=self.proxy if self.proxy else None,
201
+ # headless=False
175
202
  )
176
203
 
177
204
  # Set of URLs that were captured in that context
@@ -385,6 +412,7 @@ class Capture():
385
412
  self.context = await self.browser.new_context(
386
413
  record_har_path=self._temp_harfile.name,
387
414
  ignore_https_errors=True,
415
+ bypass_csp=True,
388
416
  http_credentials=self.http_credentials if self.http_credentials else None,
389
417
  user_agent=self.user_agent if self.user_agent else device_context_settings.pop('user_agent', None),
390
418
  locale=self.locale if self.locale else None,
@@ -472,7 +500,7 @@ class Capture():
472
500
  async def handler() -> None:
473
501
  self.logger.debug('Didomi dialog found, clicking through.')
474
502
  if await page.locator("#didomi-notice-agree-button").is_visible():
475
- await page.locator("#didomi-notice-agree-button").click(timeout=2000)
503
+ await page.locator("#didomi-notice-agree-button").click(timeout=30000)
476
504
 
477
505
  await page.add_locator_handler(page.locator(".didomi-popup-view").last, handler, times=1, no_wait_after=True)
478
506
  self.logger.info('Didomi handler added')
@@ -654,6 +682,7 @@ class Capture():
654
682
  capturing_sub = False
655
683
  try:
656
684
  page = await self.context.new_page()
685
+ await page.clock.install()
657
686
  except Error as e:
658
687
  self.logger.warning(f'The context is in a broken state: {e}')
659
688
  self.should_retry = True
@@ -671,7 +700,9 @@ class Capture():
671
700
  await self.__dialog_alert_dialog_clickthrough(page)
672
701
  await self.__dialog_clickthrough(page)
673
702
 
674
- await stealth_async(page)
703
+ await stealth_async(page, PCStealthConfig())
704
+ # await stealth_async(page)
705
+
675
706
  page.set_default_timeout((self._capture_timeout - 2) * 1000)
676
707
  # trigger a callback on each request to store it in a dict indexed by URL to get it back from the favicon fetcher
677
708
  page.on("requestfinished", store_request)
@@ -718,6 +749,7 @@ class Capture():
718
749
  else:
719
750
  raise initial_error
720
751
  else:
752
+ await self._wait_for_random_timeout(page, 5) # Wait 5 sec after document loaded
721
753
  try:
722
754
  await page.bring_to_front()
723
755
  self.logger.debug('Page moved to front.')
@@ -753,20 +785,31 @@ class Capture():
753
785
  # check if we have anything on the page. If we don't, the page is not working properly.
754
786
  if await self._failsafe_get_content(page):
755
787
  self.logger.debug('Got rendered content')
788
+
789
+ # move mouse
790
+ await page.mouse.move(x=random.uniform(300, 800), y=random.uniform(200, 500))
791
+ self.logger.debug('Moved mouse.')
792
+ await self._wait_for_random_timeout(page, 5)
793
+ self.logger.debug('Keep going after moving mouse.')
794
+
756
795
  if allow_tracking:
757
- await self._wait_for_random_timeout(page, 2)
796
+ await self._wait_for_random_timeout(page, 5)
758
797
  # This event is required to trigger the add_locator_handler
759
798
  try:
760
799
  if await page.locator("body").first.is_visible():
761
- await page.locator("body").first.click(button="right", timeout=5000)
800
+ self.logger.debug('Got body.')
801
+ await page.locator("body").first.click(button="right",
802
+ timeout=5000,
803
+ delay=50)
804
+ self.logger.debug('Clicked on body.')
762
805
  except Exception as e:
763
806
  self.logger.warning(f'Could not find body: {e}')
764
807
 
765
- # move mouse
766
- await page.mouse.move(x=random.uniform(300, 800), y=random.uniform(200, 500))
767
- self.logger.debug('Moved mouse.')
768
- await self._wait_for_random_timeout(page, 2)
769
- self.logger.debug('Keep going after moving mouse.')
808
+ # fast forward 10s (run_for takes milliseconds)
809
+ await page.clock.run_for(10000)
810
+ await page.clock.resume()
811
+ await self._wait_for_random_timeout(page, 5) # Wait 5 sec
812
+ self.logger.warning('Moved time forward.')
770
813
 
771
814
  if parsed_url.fragment:
772
815
  # We got a fragment, make sure we go to it and scroll only a little bit.
@@ -774,7 +817,8 @@ class Capture():
774
817
  try:
775
818
  await page.locator(f'id={fragment}').first.scroll_into_view_if_needed(timeout=3000)
776
819
  await self._wait_for_random_timeout(page, 2)
777
- await page.mouse.wheel(delta_y=random.uniform(150, 300), delta_x=0)
820
+ async with timeout(3):
821
+ await page.mouse.wheel(delta_y=random.uniform(150, 300), delta_x=0)
778
822
  self.logger.debug('Jumped to fragment.')
779
823
  except PlaywrightTimeoutError as e:
780
824
  self.logger.info(f'Unable to go to fragment "{fragment}" (timeout): {e}')
@@ -782,14 +826,20 @@ class Capture():
782
826
  self.logger.warning(f'Target closed, unable to go to fragment "{fragment}": {e}')
783
827
  except Error as e:
784
828
  self.logger.exception(f'Unable to go to fragment "{fragment}": {e}')
829
+ except TimeoutError:
830
+ self.logger.debug('Unable to scroll due to timeout')
785
831
  else:
786
832
  # scroll more
787
833
  try:
788
- # NOTE using page.mouse.wheel causes the instrumentation to fail, sometimes
789
- await page.mouse.wheel(delta_y=random.uniform(1500, 3000), delta_x=0)
834
+ # NOTE using page.mouse.wheel causes the instrumentation to fail, sometimes.
835
+ # 2024-07-08: Also, it sometimes gets stuck.
836
+ async with timeout(3):
837
+ await page.mouse.wheel(delta_y=random.uniform(1500, 3000), delta_x=0)
790
838
  self.logger.debug('Scrolled down.')
791
839
  except Error as e:
792
840
  self.logger.debug(f'Unable to scroll: {e}')
841
+ except TimeoutError:
842
+ self.logger.debug('Unable to scroll due to timeout')
793
843
 
794
844
  await self._wait_for_random_timeout(page, 3)
795
845
  self.logger.debug('Keep going after moving on page.')
@@ -802,7 +852,6 @@ class Capture():
802
852
  self.logger.debug('PageDown on keyboard')
803
853
  except Error as e:
804
854
  self.logger.debug(f'Unable to use keyboard: {e}')
805
-
806
855
  if self.wait_for_download > 0:
807
856
  self.logger.info('Waiting for download to finish...')
808
857
  await self._safe_wait(page, 20)
@@ -993,12 +1042,12 @@ class Capture():
993
1042
  async def _failsafe_get_screenshot(self, page: Page) -> bytes:
994
1043
  self.logger.debug("Capturing a screenshot of the full page.")
995
1044
  try:
996
- return await page.screenshot(full_page=True, timeout=5000)
1045
+ return await page.screenshot(full_page=True, timeout=10000)
997
1046
  except Error as e:
998
1047
  self.logger.info(f"Capturing a screenshot of the full page failed, trying to scale it down: {e}")
999
1048
 
1000
1049
  try:
1001
- return await page.screenshot(full_page=True, scale="css", timeout=5000)
1050
+ return await page.screenshot(full_page=True, scale="css", timeout=30000)
1002
1051
  except Error as e:
1003
1052
  self.logger.info(f"Capturing a screenshot of the full page failed, trying to get the current viewport only: {e}")
1004
1053
 
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "PlaywrightCapture"
3
- version = "1.25.0"
3
+ version = "1.25.1"
4
4
  description = "A simple library to capture websites using playwright"
5
5
  authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
6
6
  license = "BSD-3-Clause"
@@ -28,7 +28,7 @@ SpeechRecognition = {version = "^3.10.4", optional = true}
28
28
  pytz = {"version" = "^2024.1", python = "<3.9"}
29
29
  tzdata = "^2024.1"
30
30
  playwright-stealth = "^1.0.6"
31
- setuptools = "^70.2.0"
31
+ setuptools = "^70.3.0"
32
32
  puremagic = "^1.25"
33
33
  async-timeout = {version = "^4.0.3", python = "<3.11"}
34
34
  aiohttp = {extras = ["speedups"], version = "^3.9.5"}