PlaywrightCapture 1.28.5__tar.gz → 1.28.6__tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: PlaywrightCapture
3
- Version: 1.28.5
3
+ Version: 1.28.6
4
4
  Summary: A simple library to capture websites using playwright
5
5
  License: BSD-3-Clause
6
6
  Author: Raphaël Vinot
@@ -935,6 +935,7 @@ class Capture():
935
935
  ) -> CaptureResponse:
936
936
 
937
937
  to_return: CaptureResponse = {}
938
+ errors: list[str] = []
938
939
  got_favicons = False
939
940
 
940
941
  # We don't need to be super strict on the lock, as it simply triggers a wait for network idle before stopping the capture
@@ -997,6 +998,7 @@ class Capture():
997
998
  except Error as e:
998
999
  self.logger.warning(f'Unable to create new page, the context is in a broken state: {e}')
999
1000
  self.should_retry = True
1001
+ to_return['error'] = f'Unable to create new page: {e}'
1000
1002
  return to_return
1001
1003
 
1002
1004
  if allow_tracking:
@@ -1050,8 +1052,8 @@ class Capture():
1050
1052
  error_msg = download.failure()
1051
1053
  if not error_msg:
1052
1054
  raise e
1053
- to_return['error'] = f"Error while downloading: {error_msg}"
1054
- self.logger.info(to_return['error'])
1055
+ errors.append(f"Error while downloading: {error_msg}")
1056
+ self.logger.info(f'Error while downloading: {error_msg}')
1055
1057
  self.should_retry = True
1056
1058
  except Exception:
1057
1059
  raise e
@@ -1137,7 +1139,7 @@ class Capture():
1137
1139
  if consecutive_errors >= 5:
1138
1140
  # if we have more than 5 consecutive errors, the capture is most probably broken, breaking.
1139
1141
  self.logger.warning('Got more than 5 consecutive errors while capturing children, breaking.')
1140
- to_return['error'] = "Got more than 5 consecutive errors while capturing children"
1142
+ errors.append("Got more than 5 consecutive errors while capturing children")
1141
1143
  self.should_retry = True
1142
1144
  break
1143
1145
 
@@ -1149,19 +1151,19 @@ class Capture():
1149
1151
  self.logger.info(f'Unable to go back: {e}.')
1150
1152
 
1151
1153
  except PlaywrightTimeoutError as e:
1152
- to_return['error'] = f"The capture took too long - {e.message}"
1154
+ errors.append(f"The capture took too long - {e.message}")
1153
1155
  self.should_retry = True
1154
1156
  except (asyncio.TimeoutError, TimeoutError):
1155
- to_return['error'] = "Something in the capture took too long"
1157
+ errors.append("Something in the capture took too long")
1156
1158
  self.should_retry = True
1157
1159
  except TargetClosedError as e:
1158
- to_return['error'] = f"The target was closed - {e}"
1160
+ errors.append(f"The target was closed - {e}")
1159
1161
  self.should_retry = True
1160
1162
  except Error as e:
1161
- # NOTE: there are a lot of errors that look like duplicates and they are trggered at different times in the process.
1162
- # it is tricky to figure our which one whouls (and should not) trigger a retry. Below is our best guess and it will change over time.
1163
+ # NOTE: there are a lot of errors that look like duplicates and they are triggered at different times in the process.
1164
+ # it is tricky to figure out which one should (and should not) trigger a retry. Below is our best guess and it will change over time.
1163
1165
  self._update_exceptions(e)
1164
- to_return['error'] = e.message
1166
+ errors.append(e.message)
1165
1167
  to_return['error_name'] = e.name
1166
1168
  # TODO: check e.message and figure out if it is worth retrying or not.
1167
1169
  # NOTE: e.name is generally (always?) "Error"
@@ -1170,6 +1172,7 @@ class Capture():
1170
1172
  elif self._retry_network_error(e) or self._retry_browser_error(e):
1171
1173
  # this one sounds like something we can retry...
1172
1174
  self.logger.info(f'Issue with {url} (retrying): {e.message}')
1175
+ errors.append(f'Issue with {url}: {e.message}')
1173
1176
  self.should_retry = True
1174
1177
  else:
1175
1178
  # Unexpected ones
@@ -1177,9 +1180,10 @@ class Capture():
1177
1180
  except Exception as e:
1178
1181
  # we may get a non-playwright exception too.
1179
1182
  # The ones we try to handle here should be treated as if they were.
1180
- to_return['error'] = str(e)
1181
- if to_return['error'] in ['Connection closed while reading from the driver']:
1183
+ errors.append(str(e))
1184
+ if str(e) in ['Connection closed while reading from the driver']:
1182
1185
  self.logger.info(f'Issue with {url} (retrying): {e}')
1186
+ errors.append(f'Issue with {url}: {e}')
1183
1187
  self.should_retry = True
1184
1188
  else:
1185
1189
  raise e
@@ -1201,15 +1205,31 @@ class Capture():
1201
1205
  to_return["downloaded_file"] = mem_zip.getvalue()
1202
1206
 
1203
1207
  try:
1204
- to_return['storage'] = await self._failsafe_get_storage()
1205
- to_return['cookies'] = await self._failsafe_get_cookies()
1206
- self.logger.debug('Done with cookies and storage.')
1207
- except Exception as e:
1208
- if 'error' not in to_return:
1209
- to_return['error'] = f'Unable to get the storage: {e}'
1208
+ async with timeout(15):
1209
+ to_return['cookies'] = await self.context.cookies()
1210
+ except (TimeoutError, asyncio.TimeoutError):
1211
+ self.logger.warning("Unable to get cookies (timeout).")
1212
+ errors.append("Unable to get the cookies (timeout).")
1213
+ self.should_retry = True
1214
+ except Error as e:
1215
+ self.logger.warning(f"Unable to get cookies: {e}")
1216
+ errors.append(f'Unable to get the cookies: {e}')
1217
+ self.should_retry = True
1218
+
1219
+ try:
1220
+ async with timeout(15):
1221
+ to_return['storage'] = await self.context.storage_state(indexed_db=True)
1222
+ except (TimeoutError, asyncio.TimeoutError):
1223
+ self.logger.warning("Unable to get storage (timeout).")
1224
+ errors.append("Unable to get the storage (timeout).")
1225
+ self.should_retry = True
1226
+ except Error as e:
1227
+ self.logger.warning(f"Unable to get the storage: {e}")
1228
+ errors.append(f'Unable to get the storage: {e}')
1229
+ self.should_retry = True
1210
1230
  # frames_tree = self.make_frame_tree(page.main_frame)
1211
1231
  try:
1212
- async with timeout(60):
1232
+ async with timeout(30):
1213
1233
  page.remove_listener("requestfinished", store_request)
1214
1234
  await page.close(reason="Closing the page because the capture finished.")
1215
1235
  self.logger.debug('Page closed.')
@@ -1220,32 +1240,16 @@ class Capture():
1220
1240
  self.logger.debug('Got HAR.')
1221
1241
  except (TimeoutError, asyncio.TimeoutError):
1222
1242
  self.logger.warning("Unable to close page and context at the end of the capture.")
1243
+ errors.append("Unable to close page and context at the end of the capture.")
1223
1244
  self.should_retry = True
1224
1245
  except Exception as e:
1225
1246
  self.logger.warning(f"Other exception while finishing up the capture: {e}.")
1226
- if 'error' not in to_return:
1227
- to_return['error'] = f'Unable to generate HAR file: {e}'
1247
+ errors.append(f'Unable to generate HAR file: {e}')
1228
1248
  self.logger.debug('Capture done')
1249
+ if errors:
1250
+ to_return['error'] = '\n'.join(errors)
1229
1251
  return to_return
1230
1252
 
1231
- async def _failsafe_get_cookies(self) -> list[Cookie] | None:
1232
- try:
1233
- async with timeout(15):
1234
- return await self.context.cookies()
1235
- except (TimeoutError, asyncio.TimeoutError):
1236
- self.logger.warning("Unable to get cookies (timeout).")
1237
- return None
1238
-
1239
- async def _failsafe_get_storage(self) -> StorageState | None:
1240
- try:
1241
- async with timeout(15):
1242
- return await self.context.storage_state(indexed_db=True)
1243
- except (TimeoutError, asyncio.TimeoutError):
1244
- self.logger.warning("Unable to get storage (timeout).")
1245
- except Error as e:
1246
- self.logger.warning(f"Unable to get storage: {e}")
1247
- return None
1248
-
1249
1253
  async def _failsafe_get_screenshot(self, page: Page) -> bytes:
1250
1254
  self.logger.debug("Capturing a screenshot of the full page.")
1251
1255
  try:
@@ -1296,7 +1300,7 @@ class Capture():
1296
1300
  tries = 3
1297
1301
  while tries:
1298
1302
  try:
1299
- async with timeout(30):
1303
+ async with timeout(15):
1300
1304
  return await page.content()
1301
1305
  except (Error, TimeoutError, asyncio.TimeoutError):
1302
1306
  self.logger.debug('Unable to get page content, trying again.')
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "PlaywrightCapture"
3
- version = "1.28.5"
3
+ version = "1.28.6"
4
4
  description = "A simple library to capture websites using playwright"
5
5
  authors = [
6
6
  {name="Raphaël Vinot", email= "raphael.vinot@circl.lu"}
@@ -49,7 +49,7 @@ recaptcha = [
49
49
  types-beautifulsoup4 = "^4.12.0.20250204"
50
50
  pytest = "^8.3.5"
51
51
  mypy = "^1.15.0"
52
- types-dateparser = "^1.2.0.20250208"
52
+ types-dateparser = "^1.2.0.20250408"
53
53
  types-pytz = "^2025.2.0.20250326"
54
54
 
55
55