PlaywrightCapture 1.24.11__tar.gz → 1.25.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {playwrightcapture-1.24.11 → playwrightcapture-1.25.0}/PKG-INFO +5 -5
- {playwrightcapture-1.24.11 → playwrightcapture-1.25.0}/playwrightcapture/capture.py +23 -13
- {playwrightcapture-1.24.11 → playwrightcapture-1.25.0}/pyproject.toml +7 -7
- {playwrightcapture-1.24.11 → playwrightcapture-1.25.0}/LICENSE +0 -0
- {playwrightcapture-1.24.11 → playwrightcapture-1.25.0}/README.md +0 -0
- {playwrightcapture-1.24.11 → playwrightcapture-1.25.0}/playwrightcapture/__init__.py +0 -0
- {playwrightcapture-1.24.11 → playwrightcapture-1.25.0}/playwrightcapture/exceptions.py +0 -0
- {playwrightcapture-1.24.11 → playwrightcapture-1.25.0}/playwrightcapture/helpers.py +0 -0
- {playwrightcapture-1.24.11 → playwrightcapture-1.25.0}/playwrightcapture/py.typed +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: PlaywrightCapture
|
3
|
-
Version: 1.24.11
|
3
|
+
Version: 1.25.0
|
4
4
|
Summary: A simple library to capture websites using playwright
|
5
5
|
Home-page: https://github.com/Lookyloo/PlaywrightCapture
|
6
6
|
License: BSD-3-Clause
|
@@ -26,14 +26,14 @@ Requires-Dist: aiohttp[speedups] (>=3.9.5,<4.0.0)
|
|
26
26
|
Requires-Dist: async-timeout (>=4.0.3,<5.0.0) ; python_version < "3.11"
|
27
27
|
Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.12.3,<5.0.0)
|
28
28
|
Requires-Dist: dateparser (>=1.2.0,<2.0.0)
|
29
|
-
Requires-Dist: playwright (>=1.
|
29
|
+
Requires-Dist: playwright (>=1.45.0,<2.0.0)
|
30
30
|
Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
|
31
|
-
Requires-Dist: puremagic (>=1.
|
31
|
+
Requires-Dist: puremagic (>=1.25,<2.0)
|
32
32
|
Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
|
33
33
|
Requires-Dist: pytz (>=2024.1,<2025.0) ; python_version < "3.9"
|
34
|
-
Requires-Dist: setuptools (>=70.
|
34
|
+
Requires-Dist: setuptools (>=70.2.0,<71.0.0)
|
35
35
|
Requires-Dist: tzdata (>=2024.1,<2025.0)
|
36
|
-
Requires-Dist: w3lib (>=2.1
|
36
|
+
Requires-Dist: w3lib (>=2.2.1,<3.0.0)
|
37
37
|
Project-URL: Repository, https://github.com/Lookyloo/PlaywrightCapture
|
38
38
|
Description-Content-Type: text/markdown
|
39
39
|
|
@@ -521,7 +521,7 @@ class Capture():
|
|
521
521
|
elif await page.locator('#onetrust-button-group').locator("#onetrust-accept-btn-handler").is_visible():
|
522
522
|
await page.locator('#onetrust-button-group').locator("#onetrust-accept-btn-handler").click(timeout=1000)
|
523
523
|
else:
|
524
|
-
self.logger.info('Consent window found, but no button to click through.')
|
524
|
+
self.logger.info('Consent window found (alert dialog), but no button to click through.')
|
525
525
|
|
526
526
|
await page.add_locator_handler(
|
527
527
|
page.get_by_role("alertdialog").last,
|
@@ -542,7 +542,7 @@ class Capture():
|
|
542
542
|
self.logger.info('Consent window found, clicking through.')
|
543
543
|
await page.get_by_test_id("uc-accept-all-button").click(timeout=2000)
|
544
544
|
else:
|
545
|
-
self.logger.info('Consent window found, but no button to click through.')
|
545
|
+
self.logger.info('Consent window found (dialog), but no button to click through.')
|
546
546
|
await page.add_locator_handler(
|
547
547
|
page.get_by_role("dialog").last,
|
548
548
|
handler,
|
@@ -584,7 +584,7 @@ class Capture():
|
|
584
584
|
handler,
|
585
585
|
times=1, no_wait_after=True
|
586
586
|
)
|
587
|
-
self.logger.info('
|
587
|
+
self.logger.info('Piwik handler added')
|
588
588
|
|
589
589
|
async def capture_page(self, url: str, *, max_depth_capture_time: int,
|
590
590
|
referer: str | None=None,
|
@@ -722,7 +722,6 @@ class Capture():
|
|
722
722
|
await page.bring_to_front()
|
723
723
|
self.logger.debug('Page moved to front.')
|
724
724
|
except Error as e:
|
725
|
-
self.should_retry = True
|
726
725
|
self.logger.warning('Page in a broken state.')
|
727
726
|
raise e
|
728
727
|
|
@@ -757,8 +756,11 @@ class Capture():
|
|
757
756
|
if allow_tracking:
|
758
757
|
await self._wait_for_random_timeout(page, 2)
|
759
758
|
# This event is required trigger the add_locator_handler
|
760
|
-
|
761
|
-
await page.locator("body").first.
|
759
|
+
try:
|
760
|
+
if await page.locator("body").first.is_visible():
|
761
|
+
await page.locator("body").first.click(button="right", timeout=5000)
|
762
|
+
except Exception as e:
|
763
|
+
self.logger.warning(f'Could not find body: {e}')
|
762
764
|
|
763
765
|
# move mouse
|
764
766
|
await page.mouse.move(x=random.uniform(300, 800), y=random.uniform(200, 500))
|
@@ -920,19 +922,20 @@ class Capture():
|
|
920
922
|
'Navigation interrupted by another one',
|
921
923
|
'Navigation failed because page was closed!',
|
922
924
|
'Target page, context or browser has been closed',
|
923
|
-
'Protocol error (Page.bringToFront): Not attached to an active page',
|
924
925
|
'Peer failed to perform TLS handshake: A packet with illegal or unsupported version was received.',
|
925
926
|
'Peer failed to perform TLS handshake: The TLS connection was non-properly terminated.',
|
926
927
|
'Peer failed to perform TLS handshake: Error sending data: Connection reset by peer',
|
927
928
|
'Peer failed to perform TLS handshake: Error receiving data: Connection reset by peer',
|
928
|
-
'Peer sent fatal TLS alert:
|
929
|
+
'Peer sent fatal TLS alert: Handshake failed',
|
929
930
|
'Peer sent fatal TLS alert: Internal error',
|
931
|
+
'Peer sent fatal TLS alert: The server name sent was not recognized',
|
930
932
|
'Load cannot follow more than 20 redirections',
|
931
933
|
'Page crashed',
|
932
934
|
'Error receiving data: Connection reset by peer',
|
933
935
|
'Internal SOCKSv5 proxy server error.',
|
934
936
|
'Host unreachable through SOCKSv5 server.',
|
935
|
-
'HTTP/2 Error: NO_ERROR'
|
937
|
+
'HTTP/2 Error: NO_ERROR',
|
938
|
+
'HTTP/2 Error: PROTOCOL_ERROR']:
|
936
939
|
# Other errors, let's give it another shot
|
937
940
|
self.logger.info(f'Issue with {url} (retrying): {e.message}')
|
938
941
|
self.should_retry = True
|
@@ -942,16 +945,18 @@ class Capture():
|
|
942
945
|
self.should_retry = True
|
943
946
|
elif e.name in ['net::ERR_INVALID_AUTH_CREDENTIALS',
|
944
947
|
'net::ERR_BAD_SSL_CLIENT_AUTH_CERT',
|
945
|
-
'net::
|
946
|
-
|
948
|
+
'net::ERR_CERT_DATE_INVALID',
|
949
|
+
'net::ERR_UNEXPECTED_PROXY_AUTH',
|
950
|
+
'net::ERR_UNSAFE_PORT']:
|
951
|
+
# No need to retry, the credentials/certs are wrong/missing.
|
947
952
|
pass
|
948
|
-
elif e.name and any([msg in e.name for msg in ['is interrupted by another navigation to']]):
|
953
|
+
elif e.name and any([msg in e.name for msg in ['is interrupted by another navigation to', 'Page.bringToFront']]):
|
949
954
|
self.should_retry = True
|
950
955
|
elif e.name and any([msg in e.name for msg in ['Error resolving', 'Could not connect to']]):
|
951
956
|
pass
|
952
957
|
else:
|
953
958
|
# Unexpected ones
|
954
|
-
self.logger.exception(f'Something went poorly with {url}: {e.message}')
|
959
|
+
self.logger.exception(f'Something went poorly with {url}: "{e.name}" - {e.message}')
|
955
960
|
except Exception as e:
|
956
961
|
# we may get a non-playwright exception to.
|
957
962
|
# The ones we try to handle here should be treated as if they were.
|
@@ -1194,8 +1199,13 @@ class Capture():
|
|
1194
1199
|
'net::ERR_EMPTY_RESPONSE',
|
1195
1200
|
'net::ERR_HTTP_RESPONSE_CODE_FAILURE',
|
1196
1201
|
'net::ERR_HTTP2_PROTOCOL_ERROR',
|
1202
|
+
'net::ERR_INVALID_REDIRECT',
|
1197
1203
|
'net::ERR_INVALID_RESPONSE',
|
1198
1204
|
'net::ERR_NAME_NOT_RESOLVED',
|
1205
|
+
'net::ERR_NETWORK_ACCESS_DENIED',
|
1206
|
+
'net::ERR_QUIC_PROTOCOL_ERROR',
|
1207
|
+
'net::ERR_RESPONSE_HEADERS_TRUNCATED',
|
1208
|
+
'net::ERR_SOCKET_NOT_CONNECTED',
|
1199
1209
|
'net::ERR_SOCKS_CONNECTION_FAILED',
|
1200
1210
|
'net::ERR_SSL_KEY_USAGE_INCOMPATIBLE',
|
1201
1211
|
'net::ERR_SSL_PROTOCOL_ERROR',
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "PlaywrightCapture"
|
3
|
-
version = "1.24.11"
|
3
|
+
version = "1.25.0"
|
4
4
|
description = "A simple library to capture websites using playwright"
|
5
5
|
authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
|
6
6
|
license = "BSD-3-Clause"
|
@@ -19,17 +19,17 @@ classifiers=[
|
|
19
19
|
|
20
20
|
[tool.poetry.dependencies]
|
21
21
|
python = "^3.8"
|
22
|
-
playwright = "^1.
|
22
|
+
playwright = "^1.45.0"
|
23
23
|
dateparser = "^1.2.0"
|
24
24
|
beautifulsoup4 = {version= "^4.12.3", extras = ["lxml", "charset_normalizer"]}
|
25
|
-
w3lib = "^2.1
|
25
|
+
w3lib = "^2.2.1"
|
26
26
|
pydub = {version = "^0.25.1", optional = true}
|
27
27
|
SpeechRecognition = {version = "^3.10.4", optional = true}
|
28
28
|
pytz = {"version" = "^2024.1", python = "<3.9"}
|
29
29
|
tzdata = "^2024.1"
|
30
30
|
playwright-stealth = "^1.0.6"
|
31
|
-
setuptools = "^70.
|
32
|
-
puremagic = "^1.
|
31
|
+
setuptools = "^70.2.0"
|
32
|
+
puremagic = "^1.25"
|
33
33
|
async-timeout = {version = "^4.0.3", python = "<3.11"}
|
34
34
|
aiohttp = {extras = ["speedups"], version = "^3.9.5"}
|
35
35
|
aiohttp-socks = "^0.8.4"
|
@@ -42,8 +42,8 @@ optional = true
|
|
42
42
|
|
43
43
|
[tool.poetry.group.dev.dependencies]
|
44
44
|
types-beautifulsoup4 = "^4.12.0.20240511"
|
45
|
-
pytest = "^8.2.
|
46
|
-
mypy = "^1.10.
|
45
|
+
pytest = "^8.2.2"
|
46
|
+
mypy = "^1.10.1"
|
47
47
|
types-dateparser = "^1.2.0.20240420"
|
48
48
|
types-pytz = "^2024.1.0.20240417"
|
49
49
|
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|