PlaywrightCapture 1.28.4.tar.gz → 1.28.6.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: PlaywrightCapture
-Version: 1.28.4
+Version: 1.28.6
 Summary: A simple library to capture websites using playwright
 License: BSD-3-Clause
 Author: Raphaël Vinot
@@ -159,8 +159,9 @@ class Capture():
         master_logger = logging.getLogger('playwrightcapture')
         master_logger.setLevel(loglevel)
         self.logger: Logger | PlaywrightCaptureLogAdapter
-        if uuid is not None:
-            self.logger = PlaywrightCaptureLogAdapter(master_logger, {'uuid': uuid})
+        self.uuid = uuid
+        if self.uuid is not None:
+            self.logger = PlaywrightCaptureLogAdapter(master_logger, {'uuid': self.uuid})
         else:
             self.logger = master_logger
         self.browser_name: BROWSER = browser if browser else 'chromium'
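
The constructor now keeps the capture UUID on the instance, so it can be reused later in the diff to make the downloads archive name unique. PlaywrightCaptureLogAdapter itself is not shown here; as a rough sketch, a logging.LoggerAdapter keyed on a UUID usually looks like the following (the class name UuidLogAdapter and the message format are assumptions, not the package's actual code):

    import logging

    class UuidLogAdapter(logging.LoggerAdapter):
        # Prefix every message with the capture UUID so log lines from
        # concurrent captures can be told apart in a shared stream.
        def process(self, msg, kwargs):
            return f"[{self.extra['uuid']}] {msg}", kwargs

    logging.basicConfig(level=logging.INFO)
    master_logger = logging.getLogger('playwrightcapture')
    logger = UuidLogAdapter(master_logger, {'uuid': 'deadbeef-0000'})
    logger.warning('capture started')  # -> "[deadbeef-0000] capture started"
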
@@ -934,6 +935,7 @@ class Capture():
     ) -> CaptureResponse:
 
         to_return: CaptureResponse = {}
+        errors: list[str] = []
         got_favicons = False
 
         # We don't need to be super strict on the lock, as it simply triggers a wait for network idle before stoping the capture
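
Most of the remaining hunks replace the old single to_return['error'] slot, which each new failure silently overwrote, with this errors list: handlers append as failures occur, and the strings are joined into 'error' once at the end. (The early-return path in the next hunk still writes to_return['error'] directly, since it returns before the final join is reached.) A minimal sketch of the pattern, with made-up messages:

    errors: list[str] = []
    to_return: dict[str, str] = {}

    # handlers append as failures occur, instead of overwriting one slot
    errors.append('Error while downloading: net::ERR_ABORTED')
    errors.append('The capture took too long - Timeout 90000ms exceeded')

    # joined exactly once, when the capture finishes
    if errors:
        to_return['error'] = '\n'.join(errors)
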
@@ -996,6 +998,7 @@ class Capture():
         except Error as e:
             self.logger.warning(f'Unable to create new page, the context is in a broken state: {e}')
             self.should_retry = True
+            to_return['error'] = f'Unable to create new page: {e}'
             return to_return
 
         if allow_tracking:
@@ -1049,8 +1052,8 @@ class Capture():
                 error_msg = download.failure()
                 if not error_msg:
                     raise e
-                to_return['error'] = f"Error while downloading: {error_msg}"
-                self.logger.info(to_return['error'])
+                errors.append(f"Error while downloading: {error_msg}")
+                self.logger.info(f'Error while downloading: {error_msg}')
                 self.should_retry = True
             except Exception:
                 raise e
@@ -1073,20 +1076,6 @@ class Capture():
             except Exception as e:
                 self.logger.exception(f'Error during instrumentation: {e}')
 
-            if multiple_downloads:
-                if len(multiple_downloads) == 1:
-                    to_return["downloaded_filename"] = multiple_downloads[0][0]
-                    to_return["downloaded_file"] = multiple_downloads[0][1]
-                else:
-                    # we have multiple downloads, making it a zip
-                    mem_zip = BytesIO()
-                    to_return["downloaded_filename"] = 'multiple_downloads.zip'
-                    with ZipFile(mem_zip, 'w') as z:
-                        for i, f_details in enumerate(multiple_downloads):
-                            filename, file_content = f_details
-                            z.writestr(f'{i}_{filename}', file_content)
-                    to_return["downloaded_file"] = mem_zip.getvalue()
-
             if content := await self._failsafe_get_content(page):
                 to_return['html'] = content
 
@@ -1150,7 +1139,7 @@ class Capture():
                     if consecutive_errors >= 5:
                         # if we have more than 5 consecutive errors, the capture is most probably broken, breaking.
                         self.logger.warning('Got more than 5 consecutive errors while capturing children, breaking.')
-                        to_return['error'] = "Got more than 5 consecutive errors while capturing children"
+                        errors.append("Got more than 5 consecutive errors while capturing children")
                         self.should_retry = True
                         break
 
@@ -1162,19 +1151,19 @@ class Capture():
                         self.logger.info(f'Unable to go back: {e}.')
 
         except PlaywrightTimeoutError as e:
-            to_return['error'] = f"The capture took too long - {e.message}"
+            errors.append(f"The capture took too long - {e.message}")
             self.should_retry = True
         except (asyncio.TimeoutError, TimeoutError):
-            to_return['error'] = "Something in the capture took too long"
+            errors.append("Something in the capture took too long")
             self.should_retry = True
         except TargetClosedError as e:
-            to_return['error'] = f"The target was closed - {e}"
+            errors.append(f"The target was closed - {e}")
             self.should_retry = True
         except Error as e:
-            # NOTE: there are a lot of errors that look like duplicates and they are trggered at different times in the process.
-            # it is tricky to figure our which one whouls (and should not) trigger a retry. Below is our best guess and it will change over time.
+            # NOTE: there are a lot of errors that look like duplicates and they are triggered at different times in the process.
+            # it is tricky to figure our which one should (and should not) trigger a retry. Below is our best guess and it will change over time.
             self._update_exceptions(e)
-            to_return['error'] = e.message
+            errors.append(e.message)
             to_return['error_name'] = e.name
             # TODO: check e.message and figure out if it is worth retrying or not.
             # NOTE: e.name is generally (always?) "Error"
@@ -1183,6 +1172,7 @@ class Capture():
             elif self._retry_network_error(e) or self._retry_browser_error(e):
                 # this one sounds like something we can retry...
                 self.logger.info(f'Issue with {url} (retrying): {e.message}')
+                errors.append(f'Issue with {url}: {e.message}')
                 self.should_retry = True
             else:
                 # Unexpected ones
@@ -1190,25 +1180,56 @@ class Capture():
         except Exception as e:
             # we may get a non-playwright exception to.
             # The ones we try to handle here should be treated as if they were.
-            to_return['error'] = str(e)
-            if to_return['error'] in ['Connection closed while reading from the driver']:
+            errors.append(str(e))
+            if str(e) in ['Connection closed while reading from the driver']:
                 self.logger.info(f'Issue with {url} (retrying): {e}')
+                errors.append(f'Issue with {url}: {e}')
                 self.should_retry = True
             else:
                 raise e
         finally:
             self.logger.debug('Finishing up capture.')
             if not capturing_sub:
+                if multiple_downloads:
+                    if len(multiple_downloads) == 1:
+                        to_return["downloaded_filename"] = multiple_downloads[0][0]
+                        to_return["downloaded_file"] = multiple_downloads[0][1]
+                    else:
+                        # we have multiple downloads, making it a zip, make sure the filename is unique
+                        mem_zip = BytesIO()
+                        to_return["downloaded_filename"] = f'{self.uuid}_multiple_downloads.zip'
+                        with ZipFile(mem_zip, 'w') as z:
+                            for i, f_details in enumerate(multiple_downloads):
+                                filename, file_content = f_details
+                                z.writestr(f'{i}_{filename}', file_content)
+                        to_return["downloaded_file"] = mem_zip.getvalue()
+
                 try:
-                    to_return['storage'] = await self._failsafe_get_storage()
-                    to_return['cookies'] = await self._failsafe_get_cookies()
-                    self.logger.debug('Done with cookies and storage.')
-                except Exception as e:
-                    if 'error' not in to_return:
-                        to_return['error'] = f'Unable to get the storage: {e}'
+                    async with timeout(15):
+                        to_return['cookies'] = await self.context.cookies()
+                except (TimeoutError, asyncio.TimeoutError):
+                    self.logger.warning("Unable to get cookies (timeout).")
+                    errors.append("Unable to get the cookies (timeout).")
+                    self.should_retry = True
+                except Error as e:
+                    self.logger.warning(f"Unable to get cookies: {e}")
+                    errors.append(f'Unable to get the cookies: {e}')
+                    self.should_retry = True
+
+                try:
+                    async with timeout(15):
+                        to_return['storage'] = await self.context.storage_state(indexed_db=True)
+                except (TimeoutError, asyncio.TimeoutError):
+                    self.logger.warning("Unable to get storage (timeout).")
+                    errors.append("Unable to get the storage (timeout).")
+                    self.should_retry = True
+                except Error as e:
+                    self.logger.warning(f"Unable to get the storage: {e}")
+                    errors.append(f'Unable to get the storage: {e}')
+                    self.should_retry = True
             # frames_tree = self.make_frame_tree(page.main_frame)
             try:
-                async with timeout(60):
+                async with timeout(30):
                     page.remove_listener("requestfinished", store_request)
                     await page.close(reason="Closing the page because the capture finished.")
                     self.logger.debug('Page closed.')
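
The download-packaging block removed earlier in the diff reappears here inside finally, so downloads are bundled even when the capture body raised, and the archive name now carries the capture UUID to keep it unique. A minimal, self-contained sketch of the in-memory zip technique (the sample downloads are made up):

    from io import BytesIO
    from zipfile import ZipFile

    downloads: list[tuple[str, bytes]] = [
        ('invoice.pdf', b'%PDF-1.7 ...'),
        ('invoice.pdf', b'same name, different content'),
    ]

    mem_zip = BytesIO()
    with ZipFile(mem_zip, 'w') as z:
        for i, (filename, file_content) in enumerate(downloads):
            # the index prefix keeps entries unique even when filenames collide
            z.writestr(f'{i}_{filename}', file_content)

    zipped: bytes = mem_zip.getvalue()  # what ends up in to_return["downloaded_file"]
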
@@ -1219,30 +1240,16 @@ class Capture():
                     self.logger.debug('Got HAR.')
             except (TimeoutError, asyncio.TimeoutError):
                 self.logger.warning("Unable to close page and context at the end of the capture.")
+                errors.append("Unable to close page and context at the end of the capture.")
                 self.should_retry = True
             except Exception as e:
                 self.logger.warning(f"Other exception while finishing up the capture: {e}.")
-                if 'error' not in to_return:
-                    to_return['error'] = f'Unable to generate HAR file: {e}'
+                errors.append(f'Unable to generate HAR file: {e}')
             self.logger.debug('Capture done')
+        if errors:
+            to_return['error'] = '\n'.join(errors)
         return to_return
 
-    async def _failsafe_get_cookies(self) -> list[Cookie] | None:
-        try:
-            async with timeout(15):
-                return await self.context.cookies()
-        except (TimeoutError, asyncio.TimeoutError):
-            self.logger.warning("Unable to get cookies (timeout).")
-            return None
-
-    async def _failsafe_get_storage(self) -> StorageState | None:
-        try:
-            async with timeout(15):
-                return await self.context.storage_state(indexed_db=True)
-        except (TimeoutError, asyncio.TimeoutError):
-            self.logger.warning("Unable to get storage (timeout).")
-            return None
-
     async def _failsafe_get_screenshot(self, page: Page) -> bytes:
         self.logger.debug("Capturing a screenshot of the full page.")
         try:
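
The two _failsafe_get_* helpers are gone: their bodies now live inline in the finally block above, where a timeout or a Playwright Error is reported through errors and flags a retry, instead of being silently swallowed as None. The timeout() guard is the standard asyncio pattern; a minimal sketch, assuming Python 3.11+ where asyncio.timeout() is an async context manager (the coroutine name and cookie value are illustrative):

    import asyncio

    async def get_cookies_with_deadline() -> list[dict] | None:
        try:
            async with asyncio.timeout(15):
                await asyncio.sleep(30)  # stand-in for `await context.cookies()`
                return [{'name': 'session', 'value': 'abc'}]
        except TimeoutError:
            # on 3.11+, asyncio.TimeoutError is an alias of TimeoutError
            return None

    print(asyncio.run(get_cookies_with_deadline()))  # -> None after ~15s
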
@@ -1293,7 +1300,7 @@ class Capture():
         tries = 3
         while tries:
             try:
-                async with timeout(30):
+                async with timeout(15):
                     return await page.content()
             except (Error, TimeoutError, asyncio.TimeoutError):
                 self.logger.debug('Unable to get page content, trying again.')
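
Together with the 60s → 30s change above, this halves the per-attempt budget in _failsafe_get_content. Its bounded retry loop looks roughly like this; the decrement and final return fall outside the hunk, so they are assumptions, and the sketch catches only the timeout errors (the original also catches Playwright's Error):

    import asyncio

    async def failsafe_get_content(page) -> str | None:
        tries = 3
        while tries:
            try:
                async with asyncio.timeout(15):  # per-attempt deadline
                    return await page.content()
            except (TimeoutError, asyncio.TimeoutError):
                pass  # fall through and try again
            tries -= 1
        return None  # all three attempts timed out
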
@@ -1,6 +1,6 @@
 [project]
 name = "PlaywrightCapture"
-version = "1.28.4"
+version = "1.28.6"
 description = "A simple library to capture websites using playwright"
 authors = [
     {name="Raphaël Vinot", email= "raphael.vinot@circl.lu"}
@@ -49,7 +49,7 @@ recaptcha = [
 types-beautifulsoup4 = "^4.12.0.20250204"
 pytest = "^8.3.5"
 mypy = "^1.15.0"
-types-dateparser = "^1.2.0.20250208"
+types-dateparser = "^1.2.0.20250408"
 types-pytz = "^2025.2.0.20250326"
 
 