PlaywrightCapture 1.28.3__tar.gz → 1.28.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: PlaywrightCapture
3
- Version: 1.28.3
3
+ Version: 1.28.5
4
4
  Summary: A simple library to capture websites using playwright
5
5
  License: BSD-3-Clause
6
6
  Author: Raphaël Vinot
@@ -20,7 +20,7 @@ Classifier: Topic :: Security
20
20
  Provides-Extra: recaptcha
21
21
  Requires-Dist: SpeechRecognition (>=3.14.2) ; extra == "recaptcha"
22
22
  Requires-Dist: aiohttp-socks (>=0.10.1)
23
- Requires-Dist: aiohttp[speedups] (>=3.11.14)
23
+ Requires-Dist: aiohttp[speedups] (>=3.11.16)
24
24
  Requires-Dist: async-timeout (>=5.0.1) ; python_version < "3.11"
25
25
  Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.13.3)
26
26
  Requires-Dist: dateparser (>=1.2.1)
@@ -159,8 +159,9 @@ class Capture():
159
159
  master_logger = logging.getLogger('playwrightcapture')
160
160
  master_logger.setLevel(loglevel)
161
161
  self.logger: Logger | PlaywrightCaptureLogAdapter
162
- if uuid is not None:
163
- self.logger = PlaywrightCaptureLogAdapter(master_logger, {'uuid': uuid})
162
+ self.uuid = uuid
163
+ if self.uuid is not None:
164
+ self.logger = PlaywrightCaptureLogAdapter(master_logger, {'uuid': self.uuid})
164
165
  else:
165
166
  self.logger = master_logger
166
167
  self.browser_name: BROWSER = browser if browser else 'chromium'
@@ -736,7 +737,7 @@ class Capture():
736
737
  got_button: bool = False
737
738
  try:
738
739
  try:
739
- async with timeout(5):
740
+ async with timeout(3):
740
741
  if await frame.locator("button.button__acceptAll").is_visible():
741
742
  self.logger.info('Consent window found, clicking through.')
742
743
  got_button = True
@@ -746,7 +747,7 @@ class Capture():
746
747
 
747
748
  for label in labels_to_click:
748
749
  try:
749
- async with timeout(5):
750
+ async with timeout(3):
750
751
  if await frame.get_by_label(label).is_visible():
751
752
  got_button = True
752
753
  self.logger.debug(f'Got button by label on frame: {label}')
@@ -756,7 +757,7 @@ class Capture():
756
757
  self.logger.warning(f'Consent timeout (label {label}) : {e}')
757
758
 
758
759
  try:
759
- async with timeout(5):
760
+ async with timeout(3):
760
761
  if await frame.get_by_role("button", name=label).is_visible():
761
762
  got_button = True
762
763
  self.logger.debug(f'Got button by role on frame: {label}')
@@ -780,7 +781,15 @@ class Capture():
780
781
  except Exception as e:
781
782
  self.logger.info(f'Error while moving time forward: {e}')
782
783
 
783
- async def __instrumentation(self, page: Page, url: str, allow_tracking: bool, clock_set: bool) -> None:
784
+ async def __instrumentation(self, page: Page, url: str, allow_tracking: bool) -> None:
785
+ try:
786
+ # NOTE: the clock must be installed after the page is loaded, otherwise it sometimes cause the complete capture to hang.
787
+ await page.clock.install()
788
+ clock_set = True
789
+ except Error as e:
790
+ self.logger.warning(f'Unable to install the clock: {e}')
791
+ clock_set = False
792
+
784
793
  # page instrumentation
785
794
  await self._wait_for_random_timeout(page, 5) # Wait 5 sec after document loaded
786
795
  self.logger.debug('Start instrumentation.')
@@ -923,7 +932,6 @@ class Capture():
923
932
  with_screenshot: bool=True,
924
933
  with_favicon: bool=False,
925
934
  allow_tracking: bool=False,
926
- clock_set: bool=False
927
935
  ) -> CaptureResponse:
928
936
 
929
937
  to_return: CaptureResponse = {}
@@ -991,13 +999,6 @@ class Capture():
991
999
  self.should_retry = True
992
1000
  return to_return
993
1001
 
994
- try:
995
- await page.clock.install()
996
- clock_set = True
997
- except Error as e:
998
- self.logger.warning(f'Unable to install the clock: {e}')
999
- clock_set = False
1000
-
1001
1002
  if allow_tracking:
1002
1003
  # Add authorization clickthroughs
1003
1004
  await self.__dialog_didomi_clickthrough(page)
@@ -1020,8 +1021,8 @@ class Capture():
1020
1021
 
1021
1022
  try:
1022
1023
  try:
1023
- await page.goto(url, wait_until='domcontentloaded', referer=referer if referer else '')
1024
1024
  page.on("download", handle_download)
1025
+ await page.goto(url, wait_until='domcontentloaded', referer=referer if referer else '')
1025
1026
  except Error as initial_error:
1026
1027
  self._update_exceptions(initial_error)
1027
1028
  # So this one is really annoying: chromium raises a net::ERR_ABORTED when it hits a download
@@ -1066,27 +1067,13 @@ class Capture():
1066
1067
 
1067
1068
  try:
1068
1069
  if self.headless:
1069
- await self.__instrumentation(page, url, allow_tracking, clock_set)
1070
+ await self.__instrumentation(page, url, allow_tracking)
1070
1071
  else:
1071
1072
  self.logger.debug('Headed mode, skipping instrumentation.')
1072
1073
  await self._wait_for_random_timeout(page, self._capture_timeout - 5)
1073
1074
  except Exception as e:
1074
1075
  self.logger.exception(f'Error during instrumentation: {e}')
1075
1076
 
1076
- if multiple_downloads:
1077
- if len(multiple_downloads) == 1:
1078
- to_return["downloaded_filename"] = multiple_downloads[0][0]
1079
- to_return["downloaded_file"] = multiple_downloads[0][1]
1080
- else:
1081
- # we have multiple downloads, making it a zip
1082
- mem_zip = BytesIO()
1083
- to_return["downloaded_filename"] = 'multiple_downloads.zip'
1084
- with ZipFile(mem_zip, 'w') as z:
1085
- for i, f_details in enumerate(multiple_downloads):
1086
- filename, file_content = f_details
1087
- z.writestr(f'{i}_{filename}', file_content)
1088
- to_return["downloaded_file"] = mem_zip.getvalue()
1089
-
1090
1077
  if content := await self._failsafe_get_content(page):
1091
1078
  to_return['html'] = content
1092
1079
 
@@ -1134,7 +1121,7 @@ class Capture():
1134
1121
  page=page, depth=depth,
1135
1122
  rendered_hostname_only=rendered_hostname_only,
1136
1123
  max_depth_capture_time=max_capture_time,
1137
- clock_set=clock_set, with_screenshot=with_screenshot)
1124
+ with_screenshot=with_screenshot)
1138
1125
  to_return['children'].append(child_capture) # type: ignore[union-attr]
1139
1126
  except (TimeoutError, asyncio.TimeoutError):
1140
1127
  self.logger.info(f'Timeout error, took more than {max_capture_time}s. Unable to capture {url}.')
@@ -1199,13 +1186,27 @@ class Capture():
1199
1186
  finally:
1200
1187
  self.logger.debug('Finishing up capture.')
1201
1188
  if not capturing_sub:
1189
+ if multiple_downloads:
1190
+ if len(multiple_downloads) == 1:
1191
+ to_return["downloaded_filename"] = multiple_downloads[0][0]
1192
+ to_return["downloaded_file"] = multiple_downloads[0][1]
1193
+ else:
1194
+ # we have multiple downloads, making it a zip, make sure the filename is unique
1195
+ mem_zip = BytesIO()
1196
+ to_return["downloaded_filename"] = f'{self.uuid}_multiple_downloads.zip'
1197
+ with ZipFile(mem_zip, 'w') as z:
1198
+ for i, f_details in enumerate(multiple_downloads):
1199
+ filename, file_content = f_details
1200
+ z.writestr(f'{i}_{filename}', file_content)
1201
+ to_return["downloaded_file"] = mem_zip.getvalue()
1202
+
1202
1203
  try:
1203
- to_return['storage'] = await self.context.storage_state(indexed_db=True)
1204
- to_return['cookies'] = await self.context.cookies()
1205
- self.logger.debug('Done with cookies.')
1204
+ to_return['storage'] = await self._failsafe_get_storage()
1205
+ to_return['cookies'] = await self._failsafe_get_cookies()
1206
+ self.logger.debug('Done with cookies and storage.')
1206
1207
  except Exception as e:
1207
1208
  if 'error' not in to_return:
1208
- to_return['error'] = f'Unable to get the cookies: {e}'
1209
+ to_return['error'] = f'Unable to get the storage: {e}'
1209
1210
  # frames_tree = self.make_frame_tree(page.main_frame)
1210
1211
  try:
1211
1212
  async with timeout(60):
@@ -1227,6 +1228,24 @@ class Capture():
1227
1228
  self.logger.debug('Capture done')
1228
1229
  return to_return
1229
1230
 
1231
+ async def _failsafe_get_cookies(self) -> list[Cookie] | None:
1232
+ try:
1233
+ async with timeout(15):
1234
+ return await self.context.cookies()
1235
+ except (TimeoutError, asyncio.TimeoutError):
1236
+ self.logger.warning("Unable to get cookies (timeout).")
1237
+ return None
1238
+
1239
+ async def _failsafe_get_storage(self) -> StorageState | None:
1240
+ try:
1241
+ async with timeout(15):
1242
+ return await self.context.storage_state(indexed_db=True)
1243
+ except (TimeoutError, asyncio.TimeoutError):
1244
+ self.logger.warning("Unable to get storage (timeout).")
1245
+ except Error as e:
1246
+ self.logger.warning(f"Unable to get storage: {e}")
1247
+ return None
1248
+
1230
1249
  async def _failsafe_get_screenshot(self, page: Page) -> bytes:
1231
1250
  self.logger.debug("Capturing a screenshot of the full page.")
1232
1251
  try:
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "PlaywrightCapture"
3
- version = "1.28.3"
3
+ version = "1.28.5"
4
4
  description = "A simple library to capture websites using playwright"
5
5
  authors = [
6
6
  {name="Raphaël Vinot", email= "raphael.vinot@circl.lu"}
@@ -21,7 +21,7 @@ dependencies = [
21
21
  "setuptools (>=78.1.0)",
22
22
  "puremagic (>=1.28)",
23
23
  "async-timeout (>=5.0.1) ; python_version < \"3.11\"",
24
- "aiohttp[speedups] (>=3.11.14)",
24
+ "aiohttp[speedups] (>=3.11.16)",
25
25
  "aiohttp-socks (>=0.10.1)",
26
26
  "typing-extensions (>=4.12.2,<5.0.0) ; python_version < \"3.12\""
27
27
  ]