PlaywrightCapture 1.25.13.tar.gz → 1.25.15.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {playwrightcapture-1.25.13 → playwrightcapture-1.25.15}/PKG-INFO +5 -5
- {playwrightcapture-1.25.13 → playwrightcapture-1.25.15}/playwrightcapture/capture.py +102 -66
- {playwrightcapture-1.25.13 → playwrightcapture-1.25.15}/pyproject.toml +9 -9
- {playwrightcapture-1.25.13 → playwrightcapture-1.25.15}/LICENSE +0 -0
- {playwrightcapture-1.25.13 → playwrightcapture-1.25.15}/README.md +0 -0
- {playwrightcapture-1.25.13 → playwrightcapture-1.25.15}/playwrightcapture/__init__.py +0 -0
- {playwrightcapture-1.25.13 → playwrightcapture-1.25.15}/playwrightcapture/exceptions.py +0 -0
- {playwrightcapture-1.25.13 → playwrightcapture-1.25.15}/playwrightcapture/helpers.py +0 -0
- {playwrightcapture-1.25.13 → playwrightcapture-1.25.15}/playwrightcapture/py.typed +0 -0
{playwrightcapture-1.25.13 → playwrightcapture-1.25.15}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: PlaywrightCapture
-Version: 1.25.13
+Version: 1.25.15
 Summary: A simple library to capture websites using playwright
 Home-page: https://github.com/Lookyloo/PlaywrightCapture
 License: BSD-3-Clause
@@ -22,16 +22,16 @@ Classifier: Topic :: Security
 Provides-Extra: recaptcha
 Requires-Dist: SpeechRecognition (>=3.10.4,<4.0.0) ; extra == "recaptcha"
 Requires-Dist: aiohttp-socks (>=0.9,<0.10)
-Requires-Dist: aiohttp[speedups] (>=3.10.
+Requires-Dist: aiohttp[speedups] (>=3.10.5,<4.0.0)
 Requires-Dist: async-timeout (>=4.0.3,<5.0.0) ; python_version < "3.11"
 Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.12.3,<5.0.0)
 Requires-Dist: dateparser (>=1.2.0,<2.0.0)
-Requires-Dist: playwright (>=1.
+Requires-Dist: playwright (>=1.47.0,<2.0.0)
 Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
 Requires-Dist: puremagic (>=1.27,<2.0)
 Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
-Requires-Dist: pytz (>=2024.
-Requires-Dist: setuptools (>=
+Requires-Dist: pytz (>=2024.2,<2025.0) ; python_version < "3.9"
+Requires-Dist: setuptools (>=74.1.2,<75.0.0)
 Requires-Dist: tzdata (>=2024.1,<2025.0)
 Requires-Dist: w3lib (>=2.2.1,<3.0.0)
 Project-URL: Repository, https://github.com/Lookyloo/PlaywrightCapture
{playwrightcapture-1.25.13 → playwrightcapture-1.25.15}/playwrightcapture/capture.py
@@ -236,12 +236,12 @@ class Capture():
 
         return self
 
-    async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) ->
+    async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> bool:
         if hasattr(self, '_temp_harfile'):
             os.unlink(self._temp_harfile.name)
 
         try:
-            await self.browser.close()
+            await self.browser.close(reason="Closing browser at the end of the capture.")
         except Exception as e:
             # We may land in a situation where the capture was forcefully closed and the browser is already closed
             self.logger.info(f'Unable to close browser: {e}')
@@ -250,6 +250,7 @@ class Capture():
         except Exception as e:
             # this should't happen, but just in case it does...
             self.logger.info(f'Unable to stop playwright: {e}')
+        return True
 
     @property
     def locale(self) -> str:
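The signature change above makes `__aexit__` return `bool`, and the new `return True` means any exception raised inside an `async with Capture(...)` block is now suppressed by the context manager. A minimal, self-contained sketch of that Python behaviour (a toy class, not PlaywrightCapture's actual implementation):

```python
import asyncio


class SuppressingCapture:
    """Toy async context manager: because __aexit__ returns True, exceptions
    raised inside the `async with` block are swallowed, which mirrors the
    behaviour introduced by this change."""

    async def __aenter__(self) -> "SuppressingCapture":
        return self

    async def __aexit__(self, exc_type, exc, tb) -> bool:
        # Cleanup (closing the browser, stopping playwright, ...) would go here.
        return True  # suppress whatever was raised in the block


async def main() -> None:
    async with SuppressingCapture():
        raise RuntimeError("boom")  # swallowed by __aexit__
    print("still running")          # reached, because the error was suppressed


asyncio.run(main())
```

Presumably the intent is that teardown failures and in-block errors are logged and reflected in the capture result rather than propagated to the caller.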
@@ -746,7 +747,7 @@ class Capture():
                 (time + 5) * 1000))
             self.logger.debug(f'Moved time forward by ~{time}s.')
         except (TimeoutError, asyncio.TimeoutError):
-            self.logger.
+            self.logger.info('Unable to move time forward.')
 
     async def capture_page(self, url: str, *, max_depth_capture_time: int,
                            referer: str | None=None,
@@ -1118,61 +1119,19 @@ class Capture():
             to_return['error'] = f"The target was closed - {e}"
             self.should_retry = True
         except Error as e:
+            # NOTE: there are a lot of errors that look like duplicates and they are trggered at different times in the process.
+            # it is tricky to figure our which one whouls (and should not) trigger a retry. Below is our best guess and it will change over time.
             self._update_exceptions(e)
             to_return['error'] = e.message
             to_return['error_name'] = e.name
             # TODO: check e.message and figure out if it is worth retrying or not.
             # NOTE: e.name is generally (always?) "Error"
-            if self.
-                # Expected errors
+            if self._fatal_network_error(e) or self._fatal_auth_error(e) or self.fatal_browser_error(e):
                 self.logger.info(f'Unable to process {url}: {e.name}')
-
-                self.should_retry = True
-            elif e.name in ['NS_BINDING_CANCELLED_OLD_LOAD',
-                            'NS_BINDING_ABORTED',
-                            'NS_ERROR_PARSED_DATA_CACHED',
-                            'NS_ERROR_DOCUMENT_NOT_CACHED']:
+            elif self._retry_network_error(e) or self._retry_browser_error(e):
                 # this one sounds like something we can retry...
                 self.logger.info(f'Issue with {url} (retrying): {e.message}')
                 self.should_retry = True
-            elif e.name in ['Download is starting',
-                            'Connection closed',
-                            'Connection terminated unexpectedly',
-                            'Navigation interrupted by another one',
-                            'Navigation failed because page was closed!',
-                            'Target page, context or browser has been closed',
-                            'Peer failed to perform TLS handshake: A packet with illegal or unsupported version was received.',
-                            'Peer failed to perform TLS handshake: The TLS connection was non-properly terminated.',
-                            'Peer failed to perform TLS handshake: Error sending data: Connection reset by peer',
-                            'Peer failed to perform TLS handshake: Error receiving data: Connection reset by peer',
-                            'Peer sent fatal TLS alert: Handshake failed',
-                            'Peer sent fatal TLS alert: Internal error',
-                            'Peer sent fatal TLS alert: The server name sent was not recognized',
-                            'Load cannot follow more than 20 redirections',
-                            'Page crashed',
-                            'Error receiving data: Connection reset by peer',
-                            'Internal SOCKSv5 proxy server error.',
-                            'Host unreachable through SOCKSv5 server.',
-                            'HTTP/2 Error: NO_ERROR',
-                            'HTTP/2 Error: PROTOCOL_ERROR']:
-                # Other errors, let's give it another shot
-                self.logger.info(f'Issue with {url} (retrying): {e.message}')
-                self.should_retry = True
-            elif e.name in ['Target page, context or browser has been closed']:
-                # The browser barfed, let's try again
-                self.logger.info(f'Browser barfed on {url} (retrying): {e.message}')
-                self.should_retry = True
-            elif e.name in ['net::ERR_INVALID_AUTH_CREDENTIALS',
-                            'net::ERR_BAD_SSL_CLIENT_AUTH_CERT',
-                            'net::ERR_CERT_DATE_INVALID',
-                            'net::ERR_UNEXPECTED_PROXY_AUTH',
-                            'net::ERR_UNSAFE_PORT']:
-                # No need to retry, the credentials/certs are wrong/missing.
-                pass
-            elif e.name and any([msg in e.name for msg in ['is interrupted by another navigation to', 'Page.bringToFront']]):
-                self.should_retry = True
-            elif e.name and any([msg in e.name for msg in ['Error resolving', 'Could not connect to']]):
-                pass
             else:
                 # Unexpected ones
                 self.logger.exception(f'Something went poorly with {url}: "{e.name}" - {e.message}')
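The inline `elif e.name in [...]` ladders removed above are folded into boolean helper methods (reproduced in a later hunk), so the except-branch reduces to a few predicate calls. A rough, hypothetical sketch of the same dispatch pattern, with heavily shortened name lists that only illustrate the shape of the logic (the sample names are taken from this diff):

```python
# Hypothetical, trimmed illustration of predicate-based error dispatch.
from typing import Optional

RETRYABLE = {'NS_BINDING_ABORTED', 'net::ERR_CONNECTION_RESET', 'Page crashed'}
FATAL = {'NS_ERROR_ABORT', 'net::ERR_NAME_NOT_RESOLVED', 'net::ERR_INVALID_AUTH_CREDENTIALS'}


def should_retry(name: Optional[str]) -> bool:
    return name in RETRYABLE


def is_fatal(name: Optional[str]) -> bool:
    return name in FATAL


def classify(name: Optional[str]) -> str:
    # Same shape as the except-branch above: fatal errors are logged and
    # dropped, retryable ones flag a retry, everything else is unexpected.
    if is_fatal(name):
        return 'give up'
    if should_retry(name):
        return 'retry'
    return 'unexpected'


assert classify('net::ERR_CONNECTION_RESET') == 'retry'
assert classify('net::ERR_NAME_NOT_RESOLVED') == 'give up'
assert classify('SOME_NEW_ERROR') == 'unexpected'
```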
@@ -1196,14 +1155,20 @@ class Capture():
             to_return['error'] = f'Unable to get the cookies: {e}'
         # frames_tree = self.make_frame_tree(page.main_frame)
         try:
-
-
-
-
-
-
-
+            async with timeout(60):
+                page.remove_listener("requestfinished", store_request)
+                await page.close(reason="Closing the page because the capture finished.")
+                self.logger.debug('Page closed.')
+                await self.context.close(reason="Closing the context because the capture finished.")  # context needs to be closed to generate the HAR
+                self.logger.debug('Context closed.')
+                with open(self._temp_harfile.name) as _har:
+                    to_return['har'] = json.load(_har)
+                self.logger.debug('Got HAR.')
+        except (TimeoutError, asyncio.TimeoutError):
+            self.logger.warning("Unable to close page and context at the end of the capture.")
+            self.should_retry = True
         except Exception as e:
+            self.logger.warning("Other exception while finishingup the capture: {e}.")
             if 'error' not in to_return:
                 to_return['error'] = f'Unable to generate HAR file: {e}'
         self.logger.debug('Capture done')
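As the inline comment notes, the HAR is only written out once the recording context is closed, which is why the close sequence is now guarded by a 60-second timeout (the `timeout(...)` context manager in the diff presumably comes from the `async-timeout` dependency pinned for Python < 3.11). A minimal standalone sketch of the same idea with Playwright's async API, assuming Python 3.11+ for `asyncio.timeout` and using example.com as a stand-in target:

```python
import asyncio
import json
import tempfile

from playwright.async_api import async_playwright


async def capture_har(url: str) -> dict:
    """Record a HAR for a single page load; the HAR is only written to disk
    once the recording context is closed, hence the bounded teardown."""
    with tempfile.NamedTemporaryFile(suffix='.har') as harfile:
        async with async_playwright() as p:
            browser = await p.chromium.launch()
            context = await browser.new_context(record_har_path=harfile.name)
            page = await context.new_page()
            await page.goto(url)
            await page.close()
            async with asyncio.timeout(60):  # Python 3.11+; bounds the teardown
                await context.close()  # flushes the HAR file
            await browser.close()
        with open(harfile.name) as f:
            return json.load(f)


har = asyncio.run(capture_har("https://example.com"))
print(len(har["log"]["entries"]), "requests recorded")
```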
@@ -1212,17 +1177,27 @@ class Capture():
     async def _failsafe_get_screenshot(self, page: Page) -> bytes:
         self.logger.debug("Capturing a screenshot of the full page.")
         try:
-
+            async with timeout(15):
+                return await page.screenshot(full_page=True, timeout=10000)
+        except (TimeoutError, asyncio.TimeoutError):
+            self.logger.info("Screenshot of the full page got stuck, trying to scale it down.")
         except Error as e:
             self.logger.info(f"Capturing a screenshot of the full page failed, trying to scale it down: {e}")
 
         try:
-
+            async with timeout(35):
+                return await page.screenshot(full_page=True, scale="css", timeout=30000)
+        except (TimeoutError, asyncio.TimeoutError):
+            self.logger.info("Screenshot of the full page got stuck, trying to get the current viewport only.")
         except Error as e:
             self.logger.info(f"Capturing a screenshot of the full page failed, trying to get the current viewport only: {e}")
 
         try:
-
+            async with timeout(10):
+                return await page.screenshot(scale="css", animations='disabled', caret='initial', timeout=5000)
+        except (TimeoutError, asyncio.TimeoutError) as e:
+            self.logger.info("Screenshot of the full page got stuck, unable to get any screenshot.")
+            raise e
         except Error as e:
             self.logger.info(f"Unable to get any screenshot: {e}")
             raise e
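The screenshot helper now wraps each attempt in an outer timeout and degrades from a full-page shot, to a CSS-scaled full-page shot, to the current viewport only. A condensed sketch of that fallback ladder (not the library's exact method; it expects an already-open Playwright `Page` and uses `asyncio.timeout`, which requires Python 3.11+):

```python
import asyncio

from playwright.async_api import Error, Page


async def failsafe_screenshot(page: Page) -> bytes:
    """Try increasingly cheap screenshots, bounding each attempt with an
    outer timeout so a wedged call cannot hang the whole capture."""
    attempts = [
        (15, {'full_page': True, 'timeout': 10_000}),
        (35, {'full_page': True, 'scale': 'css', 'timeout': 30_000}),
        (10, {'scale': 'css', 'animations': 'disabled', 'caret': 'initial', 'timeout': 5_000}),
    ]
    last_exc: Exception = RuntimeError('no screenshot attempted')
    for outer_timeout, kwargs in attempts:
        try:
            async with asyncio.timeout(outer_timeout):
                return await page.screenshot(**kwargs)
        except (TimeoutError, Error) as e:
            last_exc = e  # fall through to the next, cheaper attempt
    raise last_exc
```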
@@ -1404,12 +1379,66 @@ class Capture():
             _, name = exception.message.split(': ', maxsplit=1)
             exception._name = name.strip()
 
-    def
+    def _retry_browser_error(self, exception: Error) -> bool:
+        if exception.name in [
+                'Download is starting',
+                'Connection closed',
+                'Connection terminated unexpectedly',
+                'Navigation interrupted by another one',
+                'Navigation failed because page was closed!',
+                'Target page, context or browser has been closed',
+                'Peer failed to perform TLS handshake: A packet with illegal or unsupported version was received.',
+                'Peer failed to perform TLS handshake: The TLS connection was non-properly terminated.',
+                'Peer failed to perform TLS handshake: Error sending data: Connection reset by peer',
+                'Peer failed to perform TLS handshake: Error receiving data: Connection reset by peer',
+                'Peer sent fatal TLS alert: Handshake failed',
+                'Peer sent fatal TLS alert: Internal error',
+                'Peer sent fatal TLS alert: The server name sent was not recognized',
+                'Load cannot follow more than 20 redirections',
+                'Page crashed',
+                'Error receiving data: Connection reset by peer',
+                'Internal SOCKSv5 proxy server error.',
+                'Host unreachable through SOCKSv5 server.',
+                # The browser barfed
+                'Target page, context or browser has been closed',
+        ]:
+            # Other errors, let's give it another shot
+            return True
+        elif exception.name and any(msg in exception.name for msg in ['is interrupted by another navigation to',
+                                                                      'Page.bringToFront',
+                                                                      'TypeError']):
+            # Match on partial string with variable content
+            return True
+        return False
+
+    def _retry_network_error(self, exception: Error) -> bool:
+        if exception.name in [
+                'HTTP/2 Error: NO_ERROR',
+                'HTTP/2 Error: PROTOCOL_ERROR',
+                'NS_BINDING_ABORTED',
+                'NS_BINDING_CANCELLED_OLD_LOAD',
+                'NS_ERROR_DOCUMENT_NOT_CACHED',
+                'NS_ERROR_NET_PARTIAL_TRANSFER',
+                'NS_ERROR_PARSED_DATA_CACHED',
+                'net::ERR_CONNECTION_RESET',
+                'net::ERR_EMPTY_RESPONSE',
+                'net::ERR_INVALID_RESPONSE',
+                'net::ERR_RESPONSE_HEADERS_TRUNCATED',
+                'net::ERR_SSL_VERSION_OR_CIPHER_MISMATCH',
+        ]:
+            return True
+        return False
+
+    def fatal_browser_error(self, exception: Error) -> bool:
+        if exception.name and any(msg in exception.name for msg in ['Error resolving', 'Could not connect to']):
+            return True
+        return False
+
+    def _fatal_network_error(self, exception: Error) -> bool:
         if exception.name in [
                 'NS_ERROR_ABORT',
                 'NS_ERROR_CONNECTION_REFUSED',
                 'NS_ERROR_NET_INTERRUPT',
-                'NS_ERROR_NET_PARTIAL_TRANSFER',
                 'NS_ERROR_NET_RESET',
                 'NS_ERROR_NET_TIMEOUT',
                 'NS_ERROR_REDIRECT_LOOP',
@@ -1420,30 +1449,37 @@ class Capture():
                 'net::ERR_ADDRESS_UNREACHABLE',
                 'net::ERR_CONNECTION_CLOSED',
                 'net::ERR_CONNECTION_REFUSED',
-                'net::ERR_CONNECTION_RESET',
                 'net::ERR_CONNECTION_TIMED_OUT',
-                'net::ERR_EMPTY_RESPONSE',
                 'net::ERR_HTTP_RESPONSE_CODE_FAILURE',
                 'net::ERR_HTTP2_PROTOCOL_ERROR',
                 'net::ERR_INVALID_REDIRECT',
-                'net::ERR_INVALID_RESPONSE',
                 'net::ERR_NAME_NOT_RESOLVED',
                 'net::ERR_NETWORK_ACCESS_DENIED',
                 'net::ERR_QUIC_PROTOCOL_ERROR',
-                'net::ERR_RESPONSE_HEADERS_TRUNCATED',
                 'net::ERR_SOCKET_NOT_CONNECTED',
                 'net::ERR_SOCKS_CONNECTION_FAILED',
                 'net::ERR_SSL_KEY_USAGE_INCOMPATIBLE',
                 'net::ERR_SSL_PROTOCOL_ERROR',
                 'net::ERR_SSL_UNRECOGNIZED_NAME_ALERT',
-                'net::ERR_SSL_VERSION_OR_CIPHER_MISMATCH',
                 'net::ERR_TIMED_OUT',
                 'net::ERR_TOO_MANY_REDIRECTS',
+                'net::ERR_UNSAFE_PORT',
                 'SSL_ERROR_UNKNOWN',
         ]:
             return True
         return False
 
+    def _fatal_auth_error(self, exception: Error) -> bool:
+        if exception.name in [
+                'net::ERR_INVALID_AUTH_CREDENTIALS',
+                'net::ERR_BAD_SSL_CLIENT_AUTH_CERT',
+                'net::ERR_CERT_DATE_INVALID',
+                'net::ERR_UNEXPECTED_PROXY_AUTH',
+        ]:
+            # No need to retry, the credentials/certs are wrong/missing.
+            return True
+        return False
+
     async def _wait_for_random_timeout(self, page: Page, timeout: int) -> None:
         '''Instead of waiting for the exact same time, we wait +-500ms around the given time. The time is fiven in seconds for simplicity's sake.'''
         if timeout > 1000:
{playwrightcapture-1.25.13 → playwrightcapture-1.25.15}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "PlaywrightCapture"
-version = "1.25.13"
+version = "1.25.15"
 description = "A simple library to capture websites using playwright"
 authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
 license = "BSD-3-Clause"
@@ -19,19 +19,19 @@ classifiers=[
 
 [tool.poetry.dependencies]
 python = "^3.8"
-playwright = "^1.
+playwright = "^1.47.0"
 dateparser = "^1.2.0"
 beautifulsoup4 = {version= "^4.12.3", extras = ["lxml", "charset_normalizer"]}
 w3lib = "^2.2.1"
 pydub = {version = "^0.25.1", optional = true}
 SpeechRecognition = {version = "^3.10.4", optional = true}
-pytz = {"version" = "^2024.
+pytz = {"version" = "^2024.2", python = "<3.9"}
 tzdata = "^2024.1"
 playwright-stealth = "^1.0.6"
-setuptools = "^
+setuptools = "^74.1.2"
 puremagic = "^1.27"
 async-timeout = {version = "^4.0.3", python = "<3.11"}
-aiohttp = {extras = ["speedups"], version = "^3.10.
+aiohttp = {extras = ["speedups"], version = "^3.10.5"}
 aiohttp-socks = "^0.9"
 
 [tool.poetry.extras]
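This dependency hunk is a set of straight constraint bumps (playwright to ^1.47.0, aiohttp to ^3.10.5, setuptools to ^74.1.2, pytz to ^2024.2 for Python < 3.9). To check whether an existing environment already satisfies the new ranges, here is a small sketch assuming the third-party `packaging` library is installed; the specifiers mirror the PKG-INFO hunk above:

```python
from importlib.metadata import PackageNotFoundError, version

from packaging.specifiers import SpecifierSet

# Caret ranges from the new pyproject.toml, rewritten as PEP 440 specifiers.
BUMPED = {
    'playwright': SpecifierSet('>=1.47.0,<2.0.0'),
    'aiohttp': SpecifierSet('>=3.10.5,<4.0.0'),
    'setuptools': SpecifierSet('>=74.1.2,<75.0.0'),
}

for name, spec in BUMPED.items():
    try:
        installed = version(name)
    except PackageNotFoundError:
        print(f'{name}: not installed')
        continue
    status = 'ok' if installed in spec else f'needs {spec}'
    print(f'{name} {installed}: {status}')
```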
@@ -41,11 +41,11 @@ recaptcha = ["pydub", "SpeechRecognition"]
 optional = true
 
 [tool.poetry.group.dev.dependencies]
-types-beautifulsoup4 = "^4.12.0.
-pytest = "^8.3.
-mypy = "^1.11.
+types-beautifulsoup4 = "^4.12.0.20240907"
+pytest = "^8.3.3"
+mypy = "^1.11.2"
 types-dateparser = "^1.2.0.20240420"
-types-pytz = "^2024.
+types-pytz = "^2024.2.0.20240913"
 
 
 [build-system]
The remaining files (LICENSE, README.md, playwrightcapture/__init__.py, playwrightcapture/exceptions.py, playwrightcapture/helpers.py, playwrightcapture/py.typed) are unchanged.