PlaywrightCapture 1.24.11__py3-none-any.whl → 1.25.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,10 +13,11 @@ import sys
13
13
  import time
14
14
 
15
15
  from base64 import b64decode
16
+ from dataclasses import dataclass
16
17
  from io import BytesIO
17
18
  from logging import LoggerAdapter, Logger
18
19
  from tempfile import NamedTemporaryFile
19
- from typing import Any, TypedDict, Literal, TYPE_CHECKING, MutableMapping
20
+ from typing import Any, TypedDict, Literal, TYPE_CHECKING, MutableMapping, Generator
20
21
  from urllib.parse import urlparse, unquote, urljoin
21
22
  from zipfile import ZipFile
22
23
 
@@ -30,7 +31,7 @@ from charset_normalizer import from_bytes
30
31
  from playwright._impl._errors import TargetClosedError
31
32
  from playwright.async_api import async_playwright, Frame, Error, Page, Download, Request
32
33
  from playwright.async_api import TimeoutError as PlaywrightTimeoutError
33
- from playwright_stealth import stealth_async # type: ignore[import-untyped]
34
+ from playwright_stealth import stealth_async, StealthConfig # type: ignore[import-untyped]
34
35
  from puremagic import PureError, from_string # type: ignore[import-untyped]
35
36
  from w3lib.html import strip_html5_whitespace
36
37
  from w3lib.url import canonicalize_url, safe_url_string
@@ -95,6 +96,31 @@ class PlaywrightCaptureLogAdapter(LoggerAdapter): # type: ignore[type-arg]
95
96
  return msg, kwargs
96
97
 
97
98
 
99
+ @dataclass
100
+ class PCStealthConfig(StealthConfig): # type: ignore[misc]
101
+
102
+ @property
103
+ def enabled_scripts(self) -> Generator[str, None, None]:
104
+ self.webdriver = True
105
+ self.webgl_vendor = True
106
+ self.chrome_app = True
107
+ self.chrome_csi = True
108
+ self.chrome_load_times = True
109
+ self.chrome_runtime = True
110
+ self.iframe_content_window = True
111
+ self.media_codecs = True
112
+ self.navigator_hardware_concurrency = 4
113
+ self.navigator_languages = False # Causes issue
114
+ self.navigator_permissions = True
115
+ self.navigator_platform = True
116
+ self.navigator_plugins = True
117
+ self.navigator_user_agent = False # Causes issues
118
+ self.navigator_vendor = False # Causes issues
119
+ self.outerdimensions = True
120
+ self.hairline = True
121
+ yield from super().enabled_scripts
122
+
123
+
98
124
  class Capture():
99
125
 
100
126
  _browsers: list[BROWSER] = ['chromium', 'firefox', 'webkit']
@@ -171,7 +197,8 @@ class Capture():
171
197
  raise UnknownPlaywrightBrowser(f'Incorrect browser name {self.browser_name}, must be in {", ".join(self._browsers)}')
172
198
 
173
199
  self.browser = await self.playwright[self.browser_name].launch(
174
- proxy=self.proxy if self.proxy else None
200
+ proxy=self.proxy if self.proxy else None,
201
+ # headless=False
175
202
  )
176
203
 
177
204
  # Set of URLs that were captured in that context
@@ -385,6 +412,7 @@ class Capture():
385
412
  self.context = await self.browser.new_context(
386
413
  record_har_path=self._temp_harfile.name,
387
414
  ignore_https_errors=True,
415
+ bypass_csp=True,
388
416
  http_credentials=self.http_credentials if self.http_credentials else None,
389
417
  user_agent=self.user_agent if self.user_agent else device_context_settings.pop('user_agent', None),
390
418
  locale=self.locale if self.locale else None,
@@ -472,7 +500,7 @@ class Capture():
472
500
  async def handler() -> None:
473
501
  self.logger.debug('Didomi dialog found, clicking through.')
474
502
  if await page.locator("#didomi-notice-agree-button").is_visible():
475
- await page.locator("#didomi-notice-agree-button").click(timeout=2000)
503
+ await page.locator("#didomi-notice-agree-button").click(timeout=30000)
476
504
 
477
505
  await page.add_locator_handler(page.locator(".didomi-popup-view").last, handler, times=1, no_wait_after=True)
478
506
  self.logger.info('Didomi handler added')
@@ -521,7 +549,7 @@ class Capture():
521
549
  elif await page.locator('#onetrust-button-group').locator("#onetrust-accept-btn-handler").is_visible():
522
550
  await page.locator('#onetrust-button-group').locator("#onetrust-accept-btn-handler").click(timeout=1000)
523
551
  else:
524
- self.logger.info('Consent window found, but no button to click through.')
552
+ self.logger.info('Consent window found (alert dialog), but no button to click through.')
525
553
 
526
554
  await page.add_locator_handler(
527
555
  page.get_by_role("alertdialog").last,
@@ -542,7 +570,7 @@ class Capture():
542
570
  self.logger.info('Consent window found, clicking through.')
543
571
  await page.get_by_test_id("uc-accept-all-button").click(timeout=2000)
544
572
  else:
545
- self.logger.info('Consent window found, but no button to click through.')
573
+ self.logger.info('Consent window found (dialog), but no button to click through.')
546
574
  await page.add_locator_handler(
547
575
  page.get_by_role("dialog").last,
548
576
  handler,
@@ -584,7 +612,7 @@ class Capture():
584
612
  handler,
585
613
  times=1, no_wait_after=True
586
614
  )
587
- self.logger.info('Yahoo handler added')
615
+ self.logger.info('Piwik handler added')
588
616
 
589
617
  async def capture_page(self, url: str, *, max_depth_capture_time: int,
590
618
  referer: str | None=None,
@@ -654,6 +682,7 @@ class Capture():
654
682
  capturing_sub = False
655
683
  try:
656
684
  page = await self.context.new_page()
685
+ await page.clock.install()
657
686
  except Error as e:
658
687
  self.logger.warning(f'The context is in a broken state: {e}')
659
688
  self.should_retry = True
@@ -671,7 +700,9 @@ class Capture():
671
700
  await self.__dialog_alert_dialog_clickthrough(page)
672
701
  await self.__dialog_clickthrough(page)
673
702
 
674
- await stealth_async(page)
703
+ await stealth_async(page, PCStealthConfig())
704
+ # await stealth_async(page)
705
+
675
706
  page.set_default_timeout((self._capture_timeout - 2) * 1000)
676
707
  # trigger a callback on each request to store it in a dict indexed by URL to get it back from the favicon fetcher
677
708
  page.on("requestfinished", store_request)
@@ -718,11 +749,11 @@ class Capture():
718
749
  else:
719
750
  raise initial_error
720
751
  else:
752
+ await self._wait_for_random_timeout(page, 5) # Wait 5 sec after document loaded
721
753
  try:
722
754
  await page.bring_to_front()
723
755
  self.logger.debug('Page moved to front.')
724
756
  except Error as e:
725
- self.should_retry = True
726
757
  self.logger.warning('Page in a broken state.')
727
758
  raise e
728
759
 
@@ -754,25 +785,40 @@ class Capture():
754
785
  # check if we have anything on the page. If we don't, the page is not working properly.
755
786
  if await self._failsafe_get_content(page):
756
787
  self.logger.debug('Got rendered content')
757
- if allow_tracking:
758
- await self._wait_for_random_timeout(page, 2)
759
- # This event is required trigger the add_locator_handler
760
- if await page.locator("body").first.is_visible():
761
- await page.locator("body").first.click(button="right", timeout=2000)
762
788
 
763
789
  # move mouse
764
790
  await page.mouse.move(x=random.uniform(300, 800), y=random.uniform(200, 500))
765
791
  self.logger.debug('Moved mouse.')
766
- await self._wait_for_random_timeout(page, 2)
792
+ await self._wait_for_random_timeout(page, 5)
767
793
  self.logger.debug('Keep going after moving mouse.')
768
794
 
795
+ if allow_tracking:
796
+ await self._wait_for_random_timeout(page, 5)
797
+ # This event is required trigger the add_locator_handler
798
+ try:
799
+ if await page.locator("body").first.is_visible():
800
+ self.logger.debug('Got body.')
801
+ await page.locator("body").first.click(button="right",
802
+ timeout=5000,
803
+ delay=50)
804
+ self.logger.debug('Clicked on body.')
805
+ except Exception as e:
806
+ self.logger.warning(f'Could not find body: {e}')
807
+
808
+ # fast forward 30s
809
+ await page.clock.run_for(10000)
810
+ await page.clock.resume()
811
+ await self._wait_for_random_timeout(page, 5) # Wait 5 sec
812
+ self.logger.warning('Moved time forward.')
813
+
769
814
  if parsed_url.fragment:
770
815
  # We got a fragment, make sure we go to it and scroll only a little bit.
771
816
  fragment = unquote(parsed_url.fragment)
772
817
  try:
773
818
  await page.locator(f'id={fragment}').first.scroll_into_view_if_needed(timeout=3000)
774
819
  await self._wait_for_random_timeout(page, 2)
775
- await page.mouse.wheel(delta_y=random.uniform(150, 300), delta_x=0)
820
+ async with timeout(3):
821
+ await page.mouse.wheel(delta_y=random.uniform(150, 300), delta_x=0)
776
822
  self.logger.debug('Jumped to fragment.')
777
823
  except PlaywrightTimeoutError as e:
778
824
  self.logger.info(f'Unable to go to fragment "{fragment}" (timeout): {e}')
@@ -780,14 +826,20 @@ class Capture():
780
826
  self.logger.warning(f'Target closed, unable to go to fragment "{fragment}": {e}')
781
827
  except Error as e:
782
828
  self.logger.exception(f'Unable to go to fragment "{fragment}": {e}')
829
+ except TimeoutError:
830
+ self.logger.debug('Unable to scroll due to timeout')
783
831
  else:
784
832
  # scroll more
785
833
  try:
786
- # NOTE using page.mouse.wheel causes the instrumentation to fail, sometimes
787
- await page.mouse.wheel(delta_y=random.uniform(1500, 3000), delta_x=0)
834
+ # NOTE using page.mouse.wheel causes the instrumentation to fail, sometimes.
835
+ # 2024-07-08: Also, it sometimes get stuck.
836
+ async with timeout(3):
837
+ await page.mouse.wheel(delta_y=random.uniform(1500, 3000), delta_x=0)
788
838
  self.logger.debug('Scrolled down.')
789
839
  except Error as e:
790
840
  self.logger.debug(f'Unable to scroll: {e}')
841
+ except TimeoutError:
842
+ self.logger.debug('Unable to scroll due to timeout')
791
843
 
792
844
  await self._wait_for_random_timeout(page, 3)
793
845
  self.logger.debug('Keep going after moving on page.')
@@ -800,7 +852,6 @@ class Capture():
800
852
  self.logger.debug('PageDown on keyboard')
801
853
  except Error as e:
802
854
  self.logger.debug(f'Unable to use keyboard: {e}')
803
-
804
855
  if self.wait_for_download > 0:
805
856
  self.logger.info('Waiting for download to finish...')
806
857
  await self._safe_wait(page, 20)
@@ -920,19 +971,20 @@ class Capture():
920
971
  'Navigation interrupted by another one',
921
972
  'Navigation failed because page was closed!',
922
973
  'Target page, context or browser has been closed',
923
- 'Protocol error (Page.bringToFront): Not attached to an active page',
924
974
  'Peer failed to perform TLS handshake: A packet with illegal or unsupported version was received.',
925
975
  'Peer failed to perform TLS handshake: The TLS connection was non-properly terminated.',
926
976
  'Peer failed to perform TLS handshake: Error sending data: Connection reset by peer',
927
977
  'Peer failed to perform TLS handshake: Error receiving data: Connection reset by peer',
928
- 'Peer sent fatal TLS alert: The server name sent was not recognized',
978
+ 'Peer sent fatal TLS alert: Handshake failed',
929
979
  'Peer sent fatal TLS alert: Internal error',
980
+ 'Peer sent fatal TLS alert: The server name sent was not recognized',
930
981
  'Load cannot follow more than 20 redirections',
931
982
  'Page crashed',
932
983
  'Error receiving data: Connection reset by peer',
933
984
  'Internal SOCKSv5 proxy server error.',
934
985
  'Host unreachable through SOCKSv5 server.',
935
- 'HTTP/2 Error: NO_ERROR']:
986
+ 'HTTP/2 Error: NO_ERROR',
987
+ 'HTTP/2 Error: PROTOCOL_ERROR']:
936
988
  # Other errors, let's give it another shot
937
989
  self.logger.info(f'Issue with {url} (retrying): {e.message}')
938
990
  self.should_retry = True
@@ -942,16 +994,18 @@ class Capture():
942
994
  self.should_retry = True
943
995
  elif e.name in ['net::ERR_INVALID_AUTH_CREDENTIALS',
944
996
  'net::ERR_BAD_SSL_CLIENT_AUTH_CERT',
945
- 'net::ERR_UNEXPECTED_PROXY_AUTH']:
946
- # No need to retry, the credentials are wrong/missing.
997
+ 'net::ERR_CERT_DATE_INVALID',
998
+ 'net::ERR_UNEXPECTED_PROXY_AUTH',
999
+ 'net::ERR_UNSAFE_PORT']:
1000
+ # No need to retry, the credentials/certs are wrong/missing.
947
1001
  pass
948
- elif e.name and any([msg in e.name for msg in ['is interrupted by another navigation to']]):
1002
+ elif e.name and any([msg in e.name for msg in ['is interrupted by another navigation to', 'Page.bringToFront']]):
949
1003
  self.should_retry = True
950
1004
  elif e.name and any([msg in e.name for msg in ['Error resolving', 'Could not connect to']]):
951
1005
  pass
952
1006
  else:
953
1007
  # Unexpected ones
954
- self.logger.exception(f'Something went poorly with {url}: {e.message}')
1008
+ self.logger.exception(f'Something went poorly with {url}: "{e.name}" - {e.message}')
955
1009
  except Exception as e:
956
1010
  # we may get a non-playwright exception to.
957
1011
  # The ones we try to handle here should be treated as if they were.
@@ -988,12 +1042,12 @@ class Capture():
988
1042
  async def _failsafe_get_screenshot(self, page: Page) -> bytes:
989
1043
  self.logger.debug("Capturing a screenshot of the full page.")
990
1044
  try:
991
- return await page.screenshot(full_page=True, timeout=5000)
1045
+ return await page.screenshot(full_page=True, timeout=10000)
992
1046
  except Error as e:
993
1047
  self.logger.info(f"Capturing a screenshot of the full page failed, trying to scale it down: {e}")
994
1048
 
995
1049
  try:
996
- return await page.screenshot(full_page=True, scale="css", timeout=5000)
1050
+ return await page.screenshot(full_page=True, scale="css", timeout=30000)
997
1051
  except Error as e:
998
1052
  self.logger.info(f"Capturing a screenshot of the full page failed, trying to get the current viewport only: {e}")
999
1053
 
@@ -1194,8 +1248,13 @@ class Capture():
1194
1248
  'net::ERR_EMPTY_RESPONSE',
1195
1249
  'net::ERR_HTTP_RESPONSE_CODE_FAILURE',
1196
1250
  'net::ERR_HTTP2_PROTOCOL_ERROR',
1251
+ 'net::ERR_INVALID_REDIRECT',
1197
1252
  'net::ERR_INVALID_RESPONSE',
1198
1253
  'net::ERR_NAME_NOT_RESOLVED',
1254
+ 'net::ERR_NETWORK_ACCESS_DENIED',
1255
+ 'net::ERR_QUIC_PROTOCOL_ERROR',
1256
+ 'net::ERR_RESPONSE_HEADERS_TRUNCATED',
1257
+ 'net::ERR_SOCKET_NOT_CONNECTED',
1199
1258
  'net::ERR_SOCKS_CONNECTION_FAILED',
1200
1259
  'net::ERR_SSL_KEY_USAGE_INCOMPATIBLE',
1201
1260
  'net::ERR_SSL_PROTOCOL_ERROR',
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: PlaywrightCapture
3
- Version: 1.24.11
3
+ Version: 1.25.1
4
4
  Summary: A simple library to capture websites using playwright
5
5
  Home-page: https://github.com/Lookyloo/PlaywrightCapture
6
6
  License: BSD-3-Clause
@@ -26,14 +26,14 @@ Requires-Dist: aiohttp[speedups] (>=3.9.5,<4.0.0)
26
26
  Requires-Dist: async-timeout (>=4.0.3,<5.0.0) ; python_version < "3.11"
27
27
  Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.12.3,<5.0.0)
28
28
  Requires-Dist: dateparser (>=1.2.0,<2.0.0)
29
- Requires-Dist: playwright (>=1.44.0,<2.0.0)
29
+ Requires-Dist: playwright (>=1.45.0,<2.0.0)
30
30
  Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
31
- Requires-Dist: puremagic (>=1.23,<2.0)
31
+ Requires-Dist: puremagic (>=1.25,<2.0)
32
32
  Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
33
33
  Requires-Dist: pytz (>=2024.1,<2025.0) ; python_version < "3.9"
34
- Requires-Dist: setuptools (>=70.0.0,<71.0.0)
34
+ Requires-Dist: setuptools (>=70.3.0,<71.0.0)
35
35
  Requires-Dist: tzdata (>=2024.1,<2025.0)
36
- Requires-Dist: w3lib (>=2.1.2,<3.0.0)
36
+ Requires-Dist: w3lib (>=2.2.1,<3.0.0)
37
37
  Project-URL: Repository, https://github.com/Lookyloo/PlaywrightCapture
38
38
  Description-Content-Type: text/markdown
39
39
 
@@ -0,0 +1,9 @@
1
+ playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
2
+ playwrightcapture/capture.py,sha256=dIsqmpBiAfkjv400Pj_WtC_69se19ARcW3eDx0LTiuI,69525
3
+ playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
4
+ playwrightcapture/helpers.py,sha256=SXQLEuxMs8-bcWykMiUVosHzzxBKuS-QC0gBV3OmKmo,1764
5
+ playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ playwrightcapture-1.25.1.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
7
+ playwrightcapture-1.25.1.dist-info/METADATA,sha256=PAPj-xddph1jI5cah7404Cqu2_6xvwHq4BKotq-Gyhk,3173
8
+ playwrightcapture-1.25.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
9
+ playwrightcapture-1.25.1.dist-info/RECORD,,
@@ -1,9 +0,0 @@
1
- playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
2
- playwrightcapture/capture.py,sha256=bEbKQKnUT4mPXzgQF8NI6hKK1LmVcBnt2MXB9BXNwgQ,66759
3
- playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
4
- playwrightcapture/helpers.py,sha256=SXQLEuxMs8-bcWykMiUVosHzzxBKuS-QC0gBV3OmKmo,1764
5
- playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- playwrightcapture-1.24.11.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
7
- playwrightcapture-1.24.11.dist-info/METADATA,sha256=c5hVyRttyjdcwwG-CVfDqmqm_a33VWePttyxKtaBJyw,3174
8
- playwrightcapture-1.24.11.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
9
- playwrightcapture-1.24.11.dist-info/RECORD,,