PlaywrightCapture 1.25.14__py3-none-any.whl → 1.25.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -236,12 +236,12 @@ class Capture():
236
236
 
237
237
  return self
238
238
 
239
- async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> None:
239
+ async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> bool:
240
240
  if hasattr(self, '_temp_harfile'):
241
241
  os.unlink(self._temp_harfile.name)
242
242
 
243
243
  try:
244
- await self.browser.close()
244
+ await self.browser.close(reason="Closing browser at the end of the capture.")
245
245
  except Exception as e:
246
246
  # We may land in a situation where the capture was forcefully closed and the browser is already closed
247
247
  self.logger.info(f'Unable to close browser: {e}')
@@ -250,6 +250,7 @@ class Capture():
250
250
  except Exception as e:
251
251
  # this shouldn't happen, but just in case it does...
252
252
  self.logger.info(f'Unable to stop playwright: {e}')
253
+ return True
253
254
 
254
255
  @property
255
256
  def locale(self) -> str:
@@ -746,7 +747,7 @@ class Capture():
746
747
  (time + 5) * 1000))
747
748
  self.logger.debug(f'Moved time forward by ~{time}s.')
748
749
  except (TimeoutError, asyncio.TimeoutError):
749
- self.logger.warning('Unable to move time forward.')
750
+ self.logger.info('Unable to move time forward.')
750
751
 
751
752
  async def capture_page(self, url: str, *, max_depth_capture_time: int,
752
753
  referer: str | None=None,
@@ -1118,61 +1119,19 @@ class Capture():
1118
1119
  to_return['error'] = f"The target was closed - {e}"
1119
1120
  self.should_retry = True
1120
1121
  except Error as e:
1122
+ # NOTE: there are a lot of errors that look like duplicates and they are triggered at different times in the process.
1123
+ # it is tricky to figure out which ones should (and should not) trigger a retry. Below is our best guess and it will change over time.
1121
1124
  self._update_exceptions(e)
1122
1125
  to_return['error'] = e.message
1123
1126
  to_return['error_name'] = e.name
1124
1127
  # TODO: check e.message and figure out if it is worth retrying or not.
1125
1128
  # NOTE: e.name is generally (always?) "Error"
1126
- if self._exception_is_network_error(e):
1127
- # Expected errors
1129
+ if self._fatal_network_error(e) or self._fatal_auth_error(e) or self.fatal_browser_error(e):
1128
1130
  self.logger.info(f'Unable to process {url}: {e.name}')
1129
- if e.name == 'net::ERR_CONNECTION_RESET':
1130
- self.should_retry = True
1131
- elif e.name in ['NS_BINDING_CANCELLED_OLD_LOAD',
1132
- 'NS_BINDING_ABORTED',
1133
- 'NS_ERROR_PARSED_DATA_CACHED',
1134
- 'NS_ERROR_DOCUMENT_NOT_CACHED']:
1131
+ elif self._retry_network_error(e) or self._retry_browser_error(e):
1135
1132
  # this one sounds like something we can retry...
1136
1133
  self.logger.info(f'Issue with {url} (retrying): {e.message}')
1137
1134
  self.should_retry = True
1138
- elif e.name in ['Download is starting',
1139
- 'Connection closed',
1140
- 'Connection terminated unexpectedly',
1141
- 'Navigation interrupted by another one',
1142
- 'Navigation failed because page was closed!',
1143
- 'Target page, context or browser has been closed',
1144
- 'Peer failed to perform TLS handshake: A packet with illegal or unsupported version was received.',
1145
- 'Peer failed to perform TLS handshake: The TLS connection was non-properly terminated.',
1146
- 'Peer failed to perform TLS handshake: Error sending data: Connection reset by peer',
1147
- 'Peer failed to perform TLS handshake: Error receiving data: Connection reset by peer',
1148
- 'Peer sent fatal TLS alert: Handshake failed',
1149
- 'Peer sent fatal TLS alert: Internal error',
1150
- 'Peer sent fatal TLS alert: The server name sent was not recognized',
1151
- 'Load cannot follow more than 20 redirections',
1152
- 'Page crashed',
1153
- 'Error receiving data: Connection reset by peer',
1154
- 'Internal SOCKSv5 proxy server error.',
1155
- 'Host unreachable through SOCKSv5 server.',
1156
- 'HTTP/2 Error: NO_ERROR',
1157
- 'HTTP/2 Error: PROTOCOL_ERROR']:
1158
- # Other errors, let's give it another shot
1159
- self.logger.info(f'Issue with {url} (retrying): {e.message}')
1160
- self.should_retry = True
1161
- elif e.name in ['Target page, context or browser has been closed']:
1162
- # The browser barfed, let's try again
1163
- self.logger.info(f'Browser barfed on {url} (retrying): {e.message}')
1164
- self.should_retry = True
1165
- elif e.name in ['net::ERR_INVALID_AUTH_CREDENTIALS',
1166
- 'net::ERR_BAD_SSL_CLIENT_AUTH_CERT',
1167
- 'net::ERR_CERT_DATE_INVALID',
1168
- 'net::ERR_UNEXPECTED_PROXY_AUTH',
1169
- 'net::ERR_UNSAFE_PORT']:
1170
- # No need to retry, the credentials/certs are wrong/missing.
1171
- pass
1172
- elif e.name and any([msg in e.name for msg in ['is interrupted by another navigation to', 'Page.bringToFront']]):
1173
- self.should_retry = True
1174
- elif e.name and any([msg in e.name for msg in ['Error resolving', 'Could not connect to']]):
1175
- pass
1176
1135
  else:
1177
1136
  # Unexpected ones
1178
1137
  self.logger.exception(f'Something went poorly with {url}: "{e.name}" - {e.message}')
@@ -1196,14 +1155,20 @@ class Capture():
1196
1155
  to_return['error'] = f'Unable to get the cookies: {e}'
1197
1156
  # frames_tree = self.make_frame_tree(page.main_frame)
1198
1157
  try:
1199
- page.remove_listener("requestfinished", store_request)
1200
- await page.close()
1201
- await self.context.close() # context needs to be closed to generate the HAR
1202
- self.logger.debug('Context closed.')
1203
- with open(self._temp_harfile.name) as _har:
1204
- to_return['har'] = json.load(_har)
1205
- self.logger.debug('Got HAR.')
1158
+ async with timeout(60):
1159
+ page.remove_listener("requestfinished", store_request)
1160
+ await page.close(reason="Closing the page because the capture finished.")
1161
+ self.logger.debug('Page closed.')
1162
+ await self.context.close(reason="Closing the context because the capture finished.") # context needs to be closed to generate the HAR
1163
+ self.logger.debug('Context closed.')
1164
+ with open(self._temp_harfile.name) as _har:
1165
+ to_return['har'] = json.load(_har)
1166
+ self.logger.debug('Got HAR.')
1167
+ except (TimeoutError, asyncio.TimeoutError):
1168
+ self.logger.warning("Unable to close page and context at the end of the capture.")
1169
+ self.should_retry = True
1206
1170
  except Exception as e:
1171
+ self.logger.warning("Other exception while finishingup the capture: {e}.")
1207
1172
  if 'error' not in to_return:
1208
1173
  to_return['error'] = f'Unable to generate HAR file: {e}'
1209
1174
  self.logger.debug('Capture done')
@@ -1212,17 +1177,27 @@ class Capture():
1212
1177
  async def _failsafe_get_screenshot(self, page: Page) -> bytes:
1213
1178
  self.logger.debug("Capturing a screenshot of the full page.")
1214
1179
  try:
1215
- return await page.screenshot(full_page=True, timeout=10000)
1180
+ async with timeout(15):
1181
+ return await page.screenshot(full_page=True, timeout=10000)
1182
+ except (TimeoutError, asyncio.TimeoutError):
1183
+ self.logger.info("Screenshot of the full page got stuck, trying to scale it down.")
1216
1184
  except Error as e:
1217
1185
  self.logger.info(f"Capturing a screenshot of the full page failed, trying to scale it down: {e}")
1218
1186
 
1219
1187
  try:
1220
- return await page.screenshot(full_page=True, scale="css", timeout=30000)
1188
+ async with timeout(35):
1189
+ return await page.screenshot(full_page=True, scale="css", timeout=30000)
1190
+ except (TimeoutError, asyncio.TimeoutError):
1191
+ self.logger.info("Screenshot of the full page got stuck, trying to get the current viewport only.")
1221
1192
  except Error as e:
1222
1193
  self.logger.info(f"Capturing a screenshot of the full page failed, trying to get the current viewport only: {e}")
1223
1194
 
1224
1195
  try:
1225
- return await page.screenshot(scale="css", animations='disabled', caret='initial', timeout=5000)
1196
+ async with timeout(10):
1197
+ return await page.screenshot(scale="css", animations='disabled', caret='initial', timeout=5000)
1198
+ except (TimeoutError, asyncio.TimeoutError) as e:
1199
+ self.logger.info("Screenshot of the full page got stuck, unable to get any screenshot.")
1200
+ raise e
1226
1201
  except Error as e:
1227
1202
  self.logger.info(f"Unable to get any screenshot: {e}")
1228
1203
  raise e
@@ -1404,12 +1379,66 @@ class Capture():
1404
1379
  _, name = exception.message.split(': ', maxsplit=1)
1405
1380
  exception._name = name.strip()
1406
1381
 
1407
- def _exception_is_network_error(self, exception: Error) -> bool:
1382
+ def _retry_browser_error(self, exception: Error) -> bool:
1383
+ if exception.name in [
1384
+ 'Download is starting',
1385
+ 'Connection closed',
1386
+ 'Connection terminated unexpectedly',
1387
+ 'Navigation interrupted by another one',
1388
+ 'Navigation failed because page was closed!',
1389
+ 'Target page, context or browser has been closed',
1390
+ 'Peer failed to perform TLS handshake: A packet with illegal or unsupported version was received.',
1391
+ 'Peer failed to perform TLS handshake: The TLS connection was non-properly terminated.',
1392
+ 'Peer failed to perform TLS handshake: Error sending data: Connection reset by peer',
1393
+ 'Peer failed to perform TLS handshake: Error receiving data: Connection reset by peer',
1394
+ 'Peer sent fatal TLS alert: Handshake failed',
1395
+ 'Peer sent fatal TLS alert: Internal error',
1396
+ 'Peer sent fatal TLS alert: The server name sent was not recognized',
1397
+ 'Load cannot follow more than 20 redirections',
1398
+ 'Page crashed',
1399
+ 'Error receiving data: Connection reset by peer',
1400
+ 'Internal SOCKSv5 proxy server error.',
1401
+ 'Host unreachable through SOCKSv5 server.',
1402
+ # The browser barfed
1403
+ 'Target page, context or browser has been closed',
1404
+ ]:
1405
+ # Other errors, let's give it another shot
1406
+ return True
1407
+ elif exception.name and any(msg in exception.name for msg in ['is interrupted by another navigation to',
1408
+ 'Page.bringToFront',
1409
+ 'TypeError']):
1410
+ # Match on partial string with variable content
1411
+ return True
1412
+ return False
1413
+
1414
+ def _retry_network_error(self, exception: Error) -> bool:
1415
+ if exception.name in [
1416
+ 'HTTP/2 Error: NO_ERROR',
1417
+ 'HTTP/2 Error: PROTOCOL_ERROR',
1418
+ 'NS_BINDING_ABORTED',
1419
+ 'NS_BINDING_CANCELLED_OLD_LOAD',
1420
+ 'NS_ERROR_DOCUMENT_NOT_CACHED',
1421
+ 'NS_ERROR_NET_PARTIAL_TRANSFER',
1422
+ 'NS_ERROR_PARSED_DATA_CACHED',
1423
+ 'net::ERR_CONNECTION_RESET',
1424
+ 'net::ERR_EMPTY_RESPONSE',
1425
+ 'net::ERR_INVALID_RESPONSE',
1426
+ 'net::ERR_RESPONSE_HEADERS_TRUNCATED',
1427
+ 'net::ERR_SSL_VERSION_OR_CIPHER_MISMATCH',
1428
+ ]:
1429
+ return True
1430
+ return False
1431
+
1432
+ def fatal_browser_error(self, exception: Error) -> bool:
1433
+ if exception.name and any(msg in exception.name for msg in ['Error resolving', 'Could not connect to']):
1434
+ return True
1435
+ return False
1436
+
1437
+ def _fatal_network_error(self, exception: Error) -> bool:
1408
1438
  if exception.name in [
1409
1439
  'NS_ERROR_ABORT',
1410
1440
  'NS_ERROR_CONNECTION_REFUSED',
1411
1441
  'NS_ERROR_NET_INTERRUPT',
1412
- 'NS_ERROR_NET_PARTIAL_TRANSFER',
1413
1442
  'NS_ERROR_NET_RESET',
1414
1443
  'NS_ERROR_NET_TIMEOUT',
1415
1444
  'NS_ERROR_REDIRECT_LOOP',
@@ -1420,30 +1449,37 @@ class Capture():
1420
1449
  'net::ERR_ADDRESS_UNREACHABLE',
1421
1450
  'net::ERR_CONNECTION_CLOSED',
1422
1451
  'net::ERR_CONNECTION_REFUSED',
1423
- 'net::ERR_CONNECTION_RESET',
1424
1452
  'net::ERR_CONNECTION_TIMED_OUT',
1425
- 'net::ERR_EMPTY_RESPONSE',
1426
1453
  'net::ERR_HTTP_RESPONSE_CODE_FAILURE',
1427
1454
  'net::ERR_HTTP2_PROTOCOL_ERROR',
1428
1455
  'net::ERR_INVALID_REDIRECT',
1429
- 'net::ERR_INVALID_RESPONSE',
1430
1456
  'net::ERR_NAME_NOT_RESOLVED',
1431
1457
  'net::ERR_NETWORK_ACCESS_DENIED',
1432
1458
  'net::ERR_QUIC_PROTOCOL_ERROR',
1433
- 'net::ERR_RESPONSE_HEADERS_TRUNCATED',
1434
1459
  'net::ERR_SOCKET_NOT_CONNECTED',
1435
1460
  'net::ERR_SOCKS_CONNECTION_FAILED',
1436
1461
  'net::ERR_SSL_KEY_USAGE_INCOMPATIBLE',
1437
1462
  'net::ERR_SSL_PROTOCOL_ERROR',
1438
1463
  'net::ERR_SSL_UNRECOGNIZED_NAME_ALERT',
1439
- 'net::ERR_SSL_VERSION_OR_CIPHER_MISMATCH',
1440
1464
  'net::ERR_TIMED_OUT',
1441
1465
  'net::ERR_TOO_MANY_REDIRECTS',
1466
+ 'net::ERR_UNSAFE_PORT',
1442
1467
  'SSL_ERROR_UNKNOWN',
1443
1468
  ]:
1444
1469
  return True
1445
1470
  return False
1446
1471
 
1472
+ def _fatal_auth_error(self, exception: Error) -> bool:
1473
+ if exception.name in [
1474
+ 'net::ERR_INVALID_AUTH_CREDENTIALS',
1475
+ 'net::ERR_BAD_SSL_CLIENT_AUTH_CERT',
1476
+ 'net::ERR_CERT_DATE_INVALID',
1477
+ 'net::ERR_UNEXPECTED_PROXY_AUTH',
1478
+ ]:
1479
+ # No need to retry, the credentials/certs are wrong/missing.
1480
+ return True
1481
+ return False
1482
+
1447
1483
  async def _wait_for_random_timeout(self, page: Page, timeout: int) -> None:
1448
1484
  '''Instead of waiting for the exact same time, we wait +-500ms around the given time. The time is given in seconds for simplicity's sake.'''
1449
1485
  if timeout > 1000:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: PlaywrightCapture
3
- Version: 1.25.14
3
+ Version: 1.25.15
4
4
  Summary: A simple library to capture websites using playwright
5
5
  Home-page: https://github.com/Lookyloo/PlaywrightCapture
6
6
  License: BSD-3-Clause
@@ -26,12 +26,12 @@ Requires-Dist: aiohttp[speedups] (>=3.10.5,<4.0.0)
26
26
  Requires-Dist: async-timeout (>=4.0.3,<5.0.0) ; python_version < "3.11"
27
27
  Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.12.3,<5.0.0)
28
28
  Requires-Dist: dateparser (>=1.2.0,<2.0.0)
29
- Requires-Dist: playwright (>=1.46.0,<2.0.0)
29
+ Requires-Dist: playwright (>=1.47.0,<2.0.0)
30
30
  Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
31
31
  Requires-Dist: puremagic (>=1.27,<2.0)
32
32
  Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
33
- Requires-Dist: pytz (>=2024.1,<2025.0) ; python_version < "3.9"
34
- Requires-Dist: setuptools (>=74.0.0,<75.0.0)
33
+ Requires-Dist: pytz (>=2024.2,<2025.0) ; python_version < "3.9"
34
+ Requires-Dist: setuptools (>=74.1.2,<75.0.0)
35
35
  Requires-Dist: tzdata (>=2024.1,<2025.0)
36
36
  Requires-Dist: w3lib (>=2.2.1,<3.0.0)
37
37
  Project-URL: Repository, https://github.com/Lookyloo/PlaywrightCapture
@@ -1,9 +1,9 @@
1
1
  playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
2
- playwrightcapture/capture.py,sha256=9ESwyexAnBtGc2QV3u4WvOKZqisMVdqHL4rko1RxJXQ,78337
2
+ playwrightcapture/capture.py,sha256=8qoKqKnd3xt_6WuxJi2DmagC8fLx6O-RPnb5yPNkdTQ,79840
3
3
  playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
4
4
  playwrightcapture/helpers.py,sha256=SXQLEuxMs8-bcWykMiUVosHzzxBKuS-QC0gBV3OmKmo,1764
5
5
  playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- playwrightcapture-1.25.14.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
7
- playwrightcapture-1.25.14.dist-info/METADATA,sha256=45rFgcxqSi2TAU8KwsE0dxdWja1u7tvHchj0x3H1dCM,3172
8
- playwrightcapture-1.25.14.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
9
- playwrightcapture-1.25.14.dist-info/RECORD,,
6
+ playwrightcapture-1.25.15.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
7
+ playwrightcapture-1.25.15.dist-info/METADATA,sha256=H_yOrQiJCbAePrkN66zoShlhvNY4AyYfdV2aHKO5uJg,3172
8
+ playwrightcapture-1.25.15.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
9
+ playwrightcapture-1.25.15.dist-info/RECORD,,