PlaywrightCapture 1.24.11__py3-none-any.whl → 1.25.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- playwrightcapture/capture.py +87 -28
- {playwrightcapture-1.24.11.dist-info → playwrightcapture-1.25.1.dist-info}/METADATA +5 -5
- playwrightcapture-1.25.1.dist-info/RECORD +9 -0
- playwrightcapture-1.24.11.dist-info/RECORD +0 -9
- {playwrightcapture-1.24.11.dist-info → playwrightcapture-1.25.1.dist-info}/LICENSE +0 -0
- {playwrightcapture-1.24.11.dist-info → playwrightcapture-1.25.1.dist-info}/WHEEL +0 -0
playwrightcapture/capture.py
CHANGED
@@ -13,10 +13,11 @@ import sys
|
|
13
13
|
import time
|
14
14
|
|
15
15
|
from base64 import b64decode
|
16
|
+
from dataclasses import dataclass
|
16
17
|
from io import BytesIO
|
17
18
|
from logging import LoggerAdapter, Logger
|
18
19
|
from tempfile import NamedTemporaryFile
|
19
|
-
from typing import Any, TypedDict, Literal, TYPE_CHECKING, MutableMapping
|
20
|
+
from typing import Any, TypedDict, Literal, TYPE_CHECKING, MutableMapping, Generator
|
20
21
|
from urllib.parse import urlparse, unquote, urljoin
|
21
22
|
from zipfile import ZipFile
|
22
23
|
|
@@ -30,7 +31,7 @@ from charset_normalizer import from_bytes
|
|
30
31
|
from playwright._impl._errors import TargetClosedError
|
31
32
|
from playwright.async_api import async_playwright, Frame, Error, Page, Download, Request
|
32
33
|
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
|
33
|
-
from playwright_stealth import stealth_async # type: ignore[import-untyped]
|
34
|
+
from playwright_stealth import stealth_async, StealthConfig # type: ignore[import-untyped]
|
34
35
|
from puremagic import PureError, from_string # type: ignore[import-untyped]
|
35
36
|
from w3lib.html import strip_html5_whitespace
|
36
37
|
from w3lib.url import canonicalize_url, safe_url_string
|
@@ -95,6 +96,31 @@ class PlaywrightCaptureLogAdapter(LoggerAdapter): # type: ignore[type-arg]
|
|
95
96
|
return msg, kwargs
|
96
97
|
|
97
98
|
|
99
|
+
@dataclass
|
100
|
+
class PCStealthConfig(StealthConfig): # type: ignore[misc]
|
101
|
+
|
102
|
+
@property
|
103
|
+
def enabled_scripts(self) -> Generator[str, None, None]:
|
104
|
+
self.webdriver = True
|
105
|
+
self.webgl_vendor = True
|
106
|
+
self.chrome_app = True
|
107
|
+
self.chrome_csi = True
|
108
|
+
self.chrome_load_times = True
|
109
|
+
self.chrome_runtime = True
|
110
|
+
self.iframe_content_window = True
|
111
|
+
self.media_codecs = True
|
112
|
+
self.navigator_hardware_concurrency = 4
|
113
|
+
self.navigator_languages = False # Causes issue
|
114
|
+
self.navigator_permissions = True
|
115
|
+
self.navigator_platform = True
|
116
|
+
self.navigator_plugins = True
|
117
|
+
self.navigator_user_agent = False # Causes issues
|
118
|
+
self.navigator_vendor = False # Causes issues
|
119
|
+
self.outerdimensions = True
|
120
|
+
self.hairline = True
|
121
|
+
yield from super().enabled_scripts
|
122
|
+
|
123
|
+
|
98
124
|
class Capture():
|
99
125
|
|
100
126
|
_browsers: list[BROWSER] = ['chromium', 'firefox', 'webkit']
|
@@ -171,7 +197,8 @@ class Capture():
|
|
171
197
|
raise UnknownPlaywrightBrowser(f'Incorrect browser name {self.browser_name}, must be in {", ".join(self._browsers)}')
|
172
198
|
|
173
199
|
self.browser = await self.playwright[self.browser_name].launch(
|
174
|
-
proxy=self.proxy if self.proxy else None
|
200
|
+
proxy=self.proxy if self.proxy else None,
|
201
|
+
# headless=False
|
175
202
|
)
|
176
203
|
|
177
204
|
# Set of URLs that were captured in that context
|
@@ -385,6 +412,7 @@ class Capture():
|
|
385
412
|
self.context = await self.browser.new_context(
|
386
413
|
record_har_path=self._temp_harfile.name,
|
387
414
|
ignore_https_errors=True,
|
415
|
+
bypass_csp=True,
|
388
416
|
http_credentials=self.http_credentials if self.http_credentials else None,
|
389
417
|
user_agent=self.user_agent if self.user_agent else device_context_settings.pop('user_agent', None),
|
390
418
|
locale=self.locale if self.locale else None,
|
@@ -472,7 +500,7 @@ class Capture():
|
|
472
500
|
async def handler() -> None:
|
473
501
|
self.logger.debug('Didomi dialog found, clicking through.')
|
474
502
|
if await page.locator("#didomi-notice-agree-button").is_visible():
|
475
|
-
await page.locator("#didomi-notice-agree-button").click(timeout=
|
503
|
+
await page.locator("#didomi-notice-agree-button").click(timeout=30000)
|
476
504
|
|
477
505
|
await page.add_locator_handler(page.locator(".didomi-popup-view").last, handler, times=1, no_wait_after=True)
|
478
506
|
self.logger.info('Didomi handler added')
|
@@ -521,7 +549,7 @@ class Capture():
|
|
521
549
|
elif await page.locator('#onetrust-button-group').locator("#onetrust-accept-btn-handler").is_visible():
|
522
550
|
await page.locator('#onetrust-button-group').locator("#onetrust-accept-btn-handler").click(timeout=1000)
|
523
551
|
else:
|
524
|
-
self.logger.info('Consent window found, but no button to click through.')
|
552
|
+
self.logger.info('Consent window found (alert dialog), but no button to click through.')
|
525
553
|
|
526
554
|
await page.add_locator_handler(
|
527
555
|
page.get_by_role("alertdialog").last,
|
@@ -542,7 +570,7 @@ class Capture():
|
|
542
570
|
self.logger.info('Consent window found, clicking through.')
|
543
571
|
await page.get_by_test_id("uc-accept-all-button").click(timeout=2000)
|
544
572
|
else:
|
545
|
-
self.logger.info('Consent window found, but no button to click through.')
|
573
|
+
self.logger.info('Consent window found (dialog), but no button to click through.')
|
546
574
|
await page.add_locator_handler(
|
547
575
|
page.get_by_role("dialog").last,
|
548
576
|
handler,
|
@@ -584,7 +612,7 @@ class Capture():
|
|
584
612
|
handler,
|
585
613
|
times=1, no_wait_after=True
|
586
614
|
)
|
587
|
-
self.logger.info('
|
615
|
+
self.logger.info('Piwik handler added')
|
588
616
|
|
589
617
|
async def capture_page(self, url: str, *, max_depth_capture_time: int,
|
590
618
|
referer: str | None=None,
|
@@ -654,6 +682,7 @@ class Capture():
|
|
654
682
|
capturing_sub = False
|
655
683
|
try:
|
656
684
|
page = await self.context.new_page()
|
685
|
+
await page.clock.install()
|
657
686
|
except Error as e:
|
658
687
|
self.logger.warning(f'The context is in a broken state: {e}')
|
659
688
|
self.should_retry = True
|
@@ -671,7 +700,9 @@ class Capture():
|
|
671
700
|
await self.__dialog_alert_dialog_clickthrough(page)
|
672
701
|
await self.__dialog_clickthrough(page)
|
673
702
|
|
674
|
-
await stealth_async(page)
|
703
|
+
await stealth_async(page, PCStealthConfig())
|
704
|
+
# await stealth_async(page)
|
705
|
+
|
675
706
|
page.set_default_timeout((self._capture_timeout - 2) * 1000)
|
676
707
|
# trigger a callback on each request to store it in a dict indexed by URL to get it back from the favicon fetcher
|
677
708
|
page.on("requestfinished", store_request)
|
@@ -718,11 +749,11 @@ class Capture():
|
|
718
749
|
else:
|
719
750
|
raise initial_error
|
720
751
|
else:
|
752
|
+
await self._wait_for_random_timeout(page, 5) # Wait 5 sec after document loaded
|
721
753
|
try:
|
722
754
|
await page.bring_to_front()
|
723
755
|
self.logger.debug('Page moved to front.')
|
724
756
|
except Error as e:
|
725
|
-
self.should_retry = True
|
726
757
|
self.logger.warning('Page in a broken state.')
|
727
758
|
raise e
|
728
759
|
|
@@ -754,25 +785,40 @@ class Capture():
|
|
754
785
|
# check if we have anything on the page. If we don't, the page is not working properly.
|
755
786
|
if await self._failsafe_get_content(page):
|
756
787
|
self.logger.debug('Got rendered content')
|
757
|
-
if allow_tracking:
|
758
|
-
await self._wait_for_random_timeout(page, 2)
|
759
|
-
# This event is required trigger the add_locator_handler
|
760
|
-
if await page.locator("body").first.is_visible():
|
761
|
-
await page.locator("body").first.click(button="right", timeout=2000)
|
762
788
|
|
763
789
|
# move mouse
|
764
790
|
await page.mouse.move(x=random.uniform(300, 800), y=random.uniform(200, 500))
|
765
791
|
self.logger.debug('Moved mouse.')
|
766
|
-
await self._wait_for_random_timeout(page,
|
792
|
+
await self._wait_for_random_timeout(page, 5)
|
767
793
|
self.logger.debug('Keep going after moving mouse.')
|
768
794
|
|
795
|
+
if allow_tracking:
|
796
|
+
await self._wait_for_random_timeout(page, 5)
|
797
|
+
# This event is required to trigger the add_locator_handler
|
798
|
+
try:
|
799
|
+
if await page.locator("body").first.is_visible():
|
800
|
+
self.logger.debug('Got body.')
|
801
|
+
await page.locator("body").first.click(button="right",
|
802
|
+
timeout=5000,
|
803
|
+
delay=50)
|
804
|
+
self.logger.debug('Clicked on body.')
|
805
|
+
except Exception as e:
|
806
|
+
self.logger.warning(f'Could not find body: {e}')
|
807
|
+
|
808
|
+
# fast forward 10s (clock.run_for takes milliseconds)
|
809
|
+
await page.clock.run_for(10000)
|
810
|
+
await page.clock.resume()
|
811
|
+
await self._wait_for_random_timeout(page, 5) # Wait 5 sec
|
812
|
+
self.logger.warning('Moved time forward.')
|
813
|
+
|
769
814
|
if parsed_url.fragment:
|
770
815
|
# We got a fragment, make sure we go to it and scroll only a little bit.
|
771
816
|
fragment = unquote(parsed_url.fragment)
|
772
817
|
try:
|
773
818
|
await page.locator(f'id={fragment}').first.scroll_into_view_if_needed(timeout=3000)
|
774
819
|
await self._wait_for_random_timeout(page, 2)
|
775
|
-
|
820
|
+
async with timeout(3):
|
821
|
+
await page.mouse.wheel(delta_y=random.uniform(150, 300), delta_x=0)
|
776
822
|
self.logger.debug('Jumped to fragment.')
|
777
823
|
except PlaywrightTimeoutError as e:
|
778
824
|
self.logger.info(f'Unable to go to fragment "{fragment}" (timeout): {e}')
|
@@ -780,14 +826,20 @@ class Capture():
|
|
780
826
|
self.logger.warning(f'Target closed, unable to go to fragment "{fragment}": {e}')
|
781
827
|
except Error as e:
|
782
828
|
self.logger.exception(f'Unable to go to fragment "{fragment}": {e}')
|
829
|
+
except TimeoutError:
|
830
|
+
self.logger.debug('Unable to scroll due to timeout')
|
783
831
|
else:
|
784
832
|
# scroll more
|
785
833
|
try:
|
786
|
-
# NOTE using page.mouse.wheel causes the instrumentation to fail, sometimes
|
787
|
-
|
834
|
+
# NOTE using page.mouse.wheel causes the instrumentation to fail, sometimes.
|
835
|
+
# 2024-07-08: Also, it sometimes gets stuck.
|
836
|
+
async with timeout(3):
|
837
|
+
await page.mouse.wheel(delta_y=random.uniform(1500, 3000), delta_x=0)
|
788
838
|
self.logger.debug('Scrolled down.')
|
789
839
|
except Error as e:
|
790
840
|
self.logger.debug(f'Unable to scroll: {e}')
|
841
|
+
except TimeoutError:
|
842
|
+
self.logger.debug('Unable to scroll due to timeout')
|
791
843
|
|
792
844
|
await self._wait_for_random_timeout(page, 3)
|
793
845
|
self.logger.debug('Keep going after moving on page.')
|
@@ -800,7 +852,6 @@ class Capture():
|
|
800
852
|
self.logger.debug('PageDown on keyboard')
|
801
853
|
except Error as e:
|
802
854
|
self.logger.debug(f'Unable to use keyboard: {e}')
|
803
|
-
|
804
855
|
if self.wait_for_download > 0:
|
805
856
|
self.logger.info('Waiting for download to finish...')
|
806
857
|
await self._safe_wait(page, 20)
|
@@ -920,19 +971,20 @@ class Capture():
|
|
920
971
|
'Navigation interrupted by another one',
|
921
972
|
'Navigation failed because page was closed!',
|
922
973
|
'Target page, context or browser has been closed',
|
923
|
-
'Protocol error (Page.bringToFront): Not attached to an active page',
|
924
974
|
'Peer failed to perform TLS handshake: A packet with illegal or unsupported version was received.',
|
925
975
|
'Peer failed to perform TLS handshake: The TLS connection was non-properly terminated.',
|
926
976
|
'Peer failed to perform TLS handshake: Error sending data: Connection reset by peer',
|
927
977
|
'Peer failed to perform TLS handshake: Error receiving data: Connection reset by peer',
|
928
|
-
'Peer sent fatal TLS alert:
|
978
|
+
'Peer sent fatal TLS alert: Handshake failed',
|
929
979
|
'Peer sent fatal TLS alert: Internal error',
|
980
|
+
'Peer sent fatal TLS alert: The server name sent was not recognized',
|
930
981
|
'Load cannot follow more than 20 redirections',
|
931
982
|
'Page crashed',
|
932
983
|
'Error receiving data: Connection reset by peer',
|
933
984
|
'Internal SOCKSv5 proxy server error.',
|
934
985
|
'Host unreachable through SOCKSv5 server.',
|
935
|
-
'HTTP/2 Error: NO_ERROR'
|
986
|
+
'HTTP/2 Error: NO_ERROR',
|
987
|
+
'HTTP/2 Error: PROTOCOL_ERROR']:
|
936
988
|
# Other errors, let's give it another shot
|
937
989
|
self.logger.info(f'Issue with {url} (retrying): {e.message}')
|
938
990
|
self.should_retry = True
|
@@ -942,16 +994,18 @@ class Capture():
|
|
942
994
|
self.should_retry = True
|
943
995
|
elif e.name in ['net::ERR_INVALID_AUTH_CREDENTIALS',
|
944
996
|
'net::ERR_BAD_SSL_CLIENT_AUTH_CERT',
|
945
|
-
'net::
|
946
|
-
|
997
|
+
'net::ERR_CERT_DATE_INVALID',
|
998
|
+
'net::ERR_UNEXPECTED_PROXY_AUTH',
|
999
|
+
'net::ERR_UNSAFE_PORT']:
|
1000
|
+
# No need to retry, the credentials/certs are wrong/missing.
|
947
1001
|
pass
|
948
|
-
elif e.name and any([msg in e.name for msg in ['is interrupted by another navigation to']]):
|
1002
|
+
elif e.name and any([msg in e.name for msg in ['is interrupted by another navigation to', 'Page.bringToFront']]):
|
949
1003
|
self.should_retry = True
|
950
1004
|
elif e.name and any([msg in e.name for msg in ['Error resolving', 'Could not connect to']]):
|
951
1005
|
pass
|
952
1006
|
else:
|
953
1007
|
# Unexpected ones
|
954
|
-
self.logger.exception(f'Something went poorly with {url}: {e.message}')
|
1008
|
+
self.logger.exception(f'Something went poorly with {url}: "{e.name}" - {e.message}')
|
955
1009
|
except Exception as e:
|
956
1010
|
# we may get a non-playwright exception too.
|
957
1011
|
# The ones we try to handle here should be treated as if they were.
|
@@ -988,12 +1042,12 @@ class Capture():
|
|
988
1042
|
async def _failsafe_get_screenshot(self, page: Page) -> bytes:
|
989
1043
|
self.logger.debug("Capturing a screenshot of the full page.")
|
990
1044
|
try:
|
991
|
-
return await page.screenshot(full_page=True, timeout=
|
1045
|
+
return await page.screenshot(full_page=True, timeout=10000)
|
992
1046
|
except Error as e:
|
993
1047
|
self.logger.info(f"Capturing a screenshot of the full page failed, trying to scale it down: {e}")
|
994
1048
|
|
995
1049
|
try:
|
996
|
-
return await page.screenshot(full_page=True, scale="css", timeout=
|
1050
|
+
return await page.screenshot(full_page=True, scale="css", timeout=30000)
|
997
1051
|
except Error as e:
|
998
1052
|
self.logger.info(f"Capturing a screenshot of the full page failed, trying to get the current viewport only: {e}")
|
999
1053
|
|
@@ -1194,8 +1248,13 @@ class Capture():
|
|
1194
1248
|
'net::ERR_EMPTY_RESPONSE',
|
1195
1249
|
'net::ERR_HTTP_RESPONSE_CODE_FAILURE',
|
1196
1250
|
'net::ERR_HTTP2_PROTOCOL_ERROR',
|
1251
|
+
'net::ERR_INVALID_REDIRECT',
|
1197
1252
|
'net::ERR_INVALID_RESPONSE',
|
1198
1253
|
'net::ERR_NAME_NOT_RESOLVED',
|
1254
|
+
'net::ERR_NETWORK_ACCESS_DENIED',
|
1255
|
+
'net::ERR_QUIC_PROTOCOL_ERROR',
|
1256
|
+
'net::ERR_RESPONSE_HEADERS_TRUNCATED',
|
1257
|
+
'net::ERR_SOCKET_NOT_CONNECTED',
|
1199
1258
|
'net::ERR_SOCKS_CONNECTION_FAILED',
|
1200
1259
|
'net::ERR_SSL_KEY_USAGE_INCOMPATIBLE',
|
1201
1260
|
'net::ERR_SSL_PROTOCOL_ERROR',
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: PlaywrightCapture
|
3
|
-
Version: 1.
|
3
|
+
Version: 1.25.1
|
4
4
|
Summary: A simple library to capture websites using playwright
|
5
5
|
Home-page: https://github.com/Lookyloo/PlaywrightCapture
|
6
6
|
License: BSD-3-Clause
|
@@ -26,14 +26,14 @@ Requires-Dist: aiohttp[speedups] (>=3.9.5,<4.0.0)
|
|
26
26
|
Requires-Dist: async-timeout (>=4.0.3,<5.0.0) ; python_version < "3.11"
|
27
27
|
Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.12.3,<5.0.0)
|
28
28
|
Requires-Dist: dateparser (>=1.2.0,<2.0.0)
|
29
|
-
Requires-Dist: playwright (>=1.
|
29
|
+
Requires-Dist: playwright (>=1.45.0,<2.0.0)
|
30
30
|
Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
|
31
|
-
Requires-Dist: puremagic (>=1.
|
31
|
+
Requires-Dist: puremagic (>=1.25,<2.0)
|
32
32
|
Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
|
33
33
|
Requires-Dist: pytz (>=2024.1,<2025.0) ; python_version < "3.9"
|
34
|
-
Requires-Dist: setuptools (>=70.
|
34
|
+
Requires-Dist: setuptools (>=70.3.0,<71.0.0)
|
35
35
|
Requires-Dist: tzdata (>=2024.1,<2025.0)
|
36
|
-
Requires-Dist: w3lib (>=2.1
|
36
|
+
Requires-Dist: w3lib (>=2.2.1,<3.0.0)
|
37
37
|
Project-URL: Repository, https://github.com/Lookyloo/PlaywrightCapture
|
38
38
|
Description-Content-Type: text/markdown
|
39
39
|
|
@@ -0,0 +1,9 @@
|
|
1
|
+
playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
|
2
|
+
playwrightcapture/capture.py,sha256=dIsqmpBiAfkjv400Pj_WtC_69se19ARcW3eDx0LTiuI,69525
|
3
|
+
playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
|
4
|
+
playwrightcapture/helpers.py,sha256=SXQLEuxMs8-bcWykMiUVosHzzxBKuS-QC0gBV3OmKmo,1764
|
5
|
+
playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
+
playwrightcapture-1.25.1.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
|
7
|
+
playwrightcapture-1.25.1.dist-info/METADATA,sha256=PAPj-xddph1jI5cah7404Cqu2_6xvwHq4BKotq-Gyhk,3173
|
8
|
+
playwrightcapture-1.25.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
9
|
+
playwrightcapture-1.25.1.dist-info/RECORD,,
|
@@ -1,9 +0,0 @@
|
|
1
|
-
playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
|
2
|
-
playwrightcapture/capture.py,sha256=bEbKQKnUT4mPXzgQF8NI6hKK1LmVcBnt2MXB9BXNwgQ,66759
|
3
|
-
playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
|
4
|
-
playwrightcapture/helpers.py,sha256=SXQLEuxMs8-bcWykMiUVosHzzxBKuS-QC0gBV3OmKmo,1764
|
5
|
-
playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
-
playwrightcapture-1.24.11.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
|
7
|
-
playwrightcapture-1.24.11.dist-info/METADATA,sha256=c5hVyRttyjdcwwG-CVfDqmqm_a33VWePttyxKtaBJyw,3174
|
8
|
-
playwrightcapture-1.24.11.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
9
|
-
playwrightcapture-1.24.11.dist-info/RECORD,,
|
File without changes
|
File without changes
|