PlaywrightCapture 1.28.2__py3-none-any.whl → 1.28.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -736,7 +736,7 @@ class Capture():
736
736
  got_button: bool = False
737
737
  try:
738
738
  try:
739
- async with timeout(5):
739
+ async with timeout(3):
740
740
  if await frame.locator("button.button__acceptAll").is_visible():
741
741
  self.logger.info('Consent window found, clicking through.')
742
742
  got_button = True
@@ -746,7 +746,7 @@ class Capture():
746
746
 
747
747
  for label in labels_to_click:
748
748
  try:
749
- async with timeout(5):
749
+ async with timeout(3):
750
750
  if await frame.get_by_label(label).is_visible():
751
751
  got_button = True
752
752
  self.logger.debug(f'Got button by label on frame: {label}')
@@ -756,7 +756,7 @@ class Capture():
756
756
  self.logger.warning(f'Consent timeout (label {label}) : {e}')
757
757
 
758
758
  try:
759
- async with timeout(5):
759
+ async with timeout(3):
760
760
  if await frame.get_by_role("button", name=label).is_visible():
761
761
  got_button = True
762
762
  self.logger.debug(f'Got button by role on frame: {label}')
@@ -780,7 +780,15 @@ class Capture():
780
780
  except Exception as e:
781
781
  self.logger.info(f'Error while moving time forward: {e}')
782
782
 
783
- async def __instrumentation(self, page: Page, url: str, allow_tracking: bool, clock_set: bool) -> None:
783
+ async def __instrumentation(self, page: Page, url: str, allow_tracking: bool) -> None:
784
+ try:
785
+ # NOTE: the clock must be installed after the page is loaded, otherwise it sometimes causes the complete capture to hang.
786
+ await page.clock.install()
787
+ clock_set = True
788
+ except Error as e:
789
+ self.logger.warning(f'Unable to install the clock: {e}')
790
+ clock_set = False
791
+
784
792
  # page instrumentation
785
793
  await self._wait_for_random_timeout(page, 5) # Wait 5 sec after document loaded
786
794
  self.logger.debug('Start instrumentation.')
@@ -920,9 +928,9 @@ class Capture():
920
928
  referer: str | None=None,
921
929
  page: Page | None=None, depth: int=0,
922
930
  rendered_hostname_only: bool=True,
931
+ with_screenshot: bool=True,
923
932
  with_favicon: bool=False,
924
933
  allow_tracking: bool=False,
925
- clock_set: bool=False
926
934
  ) -> CaptureResponse:
927
935
 
928
936
  to_return: CaptureResponse = {}
@@ -990,13 +998,6 @@ class Capture():
990
998
  self.should_retry = True
991
999
  return to_return
992
1000
 
993
- try:
994
- await page.clock.install()
995
- clock_set = True
996
- except Error as e:
997
- self.logger.warning(f'Unable to install the clock: {e}')
998
- clock_set = False
999
-
1000
1001
  if allow_tracking:
1001
1002
  # Add authorization clickthroughs
1002
1003
  await self.__dialog_didomi_clickthrough(page)
@@ -1019,8 +1020,8 @@ class Capture():
1019
1020
 
1020
1021
  try:
1021
1022
  try:
1022
- await page.goto(url, wait_until='domcontentloaded', referer=referer if referer else '')
1023
1023
  page.on("download", handle_download)
1024
+ await page.goto(url, wait_until='domcontentloaded', referer=referer if referer else '')
1024
1025
  except Error as initial_error:
1025
1026
  self._update_exceptions(initial_error)
1026
1027
  # So this one is really annoying: chromium raises a net::ERR_ABORTED when it hits a download
@@ -1065,7 +1066,7 @@ class Capture():
1065
1066
 
1066
1067
  try:
1067
1068
  if self.headless:
1068
- await self.__instrumentation(page, url, allow_tracking, clock_set)
1069
+ await self.__instrumentation(page, url, allow_tracking)
1069
1070
  else:
1070
1071
  self.logger.debug('Headed mode, skipping instrumentation.')
1071
1072
  await self._wait_for_random_timeout(page, self._capture_timeout - 5)
@@ -1099,7 +1100,9 @@ class Capture():
1099
1100
  self.logger.warning(f'Unable to get favicons: {e}')
1100
1101
 
1101
1102
  to_return['last_redirected_url'] = page.url
1102
- to_return['png'] = await self._failsafe_get_screenshot(page)
1103
+
1104
+ if with_screenshot:
1105
+ to_return['png'] = await self._failsafe_get_screenshot(page)
1103
1106
 
1104
1107
  self._already_captured.add(url)
1105
1108
  if depth > 0 and to_return.get('html') and to_return['html']:
@@ -1131,7 +1134,7 @@ class Capture():
1131
1134
  page=page, depth=depth,
1132
1135
  rendered_hostname_only=rendered_hostname_only,
1133
1136
  max_depth_capture_time=max_capture_time,
1134
- clock_set=clock_set)
1137
+ with_screenshot=with_screenshot)
1135
1138
  to_return['children'].append(child_capture) # type: ignore[union-attr]
1136
1139
  except (TimeoutError, asyncio.TimeoutError):
1137
1140
  self.logger.info(f'Timeout error, took more than {max_capture_time}s. Unable to capture {url}.')
@@ -1197,12 +1200,12 @@ class Capture():
1197
1200
  self.logger.debug('Finishing up capture.')
1198
1201
  if not capturing_sub:
1199
1202
  try:
1200
- to_return['storage'] = await self.context.storage_state(indexed_db=True)
1201
- to_return['cookies'] = await self.context.cookies()
1202
- self.logger.debug('Done with cookies.')
1203
+ to_return['storage'] = await self._failsafe_get_storage()
1204
+ to_return['cookies'] = await self._failsafe_get_cookies()
1205
+ self.logger.debug('Done with cookies and storage.')
1203
1206
  except Exception as e:
1204
1207
  if 'error' not in to_return:
1205
- to_return['error'] = f'Unable to get the cookies: {e}'
1208
+ to_return['error'] = f'Unable to get the storage: {e}'
1206
1209
  # frames_tree = self.make_frame_tree(page.main_frame)
1207
1210
  try:
1208
1211
  async with timeout(60):
@@ -1224,6 +1227,22 @@ class Capture():
1224
1227
  self.logger.debug('Capture done')
1225
1228
  return to_return
1226
1229
 
1230
+ async def _failsafe_get_cookies(self) -> list[Cookie] | None:
1231
+ try:
1232
+ async with timeout(15):
1233
+ return await self.context.cookies()
1234
+ except (TimeoutError, asyncio.TimeoutError):
1235
+ self.logger.warning("Unable to get cookies (timeout).")
1236
+ return None
1237
+
1238
+ async def _failsafe_get_storage(self) -> StorageState | None:
1239
+ try:
1240
+ async with timeout(15):
1241
+ return await self.context.storage_state(indexed_db=True)
1242
+ except (TimeoutError, asyncio.TimeoutError):
1243
+ self.logger.warning("Unable to get storage (timeout).")
1244
+ return None
1245
+
1227
1246
  async def _failsafe_get_screenshot(self, page: Page) -> bytes:
1228
1247
  self.logger.debug("Capturing a screenshot of the full page.")
1229
1248
  try:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: PlaywrightCapture
3
- Version: 1.28.2
3
+ Version: 1.28.4
4
4
  Summary: A simple library to capture websites using playwright
5
5
  License: BSD-3-Clause
6
6
  Author: Raphaël Vinot
@@ -18,9 +18,9 @@ Classifier: Programming Language :: Python :: 3.13
18
18
  Classifier: Topic :: Internet
19
19
  Classifier: Topic :: Security
20
20
  Provides-Extra: recaptcha
21
- Requires-Dist: SpeechRecognition (>=3.14.1) ; extra == "recaptcha"
21
+ Requires-Dist: SpeechRecognition (>=3.14.2) ; extra == "recaptcha"
22
22
  Requires-Dist: aiohttp-socks (>=0.10.1)
23
- Requires-Dist: aiohttp[speedups] (>=3.11.14)
23
+ Requires-Dist: aiohttp[speedups] (>=3.11.16)
24
24
  Requires-Dist: async-timeout (>=5.0.1) ; python_version < "3.11"
25
25
  Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.13.3)
26
26
  Requires-Dist: dateparser (>=1.2.1)
@@ -28,9 +28,9 @@ Requires-Dist: playwright (>=1.51.0)
28
28
  Requires-Dist: playwright-stealth (>=1.0.6)
29
29
  Requires-Dist: puremagic (>=1.28)
30
30
  Requires-Dist: pydub (>=0.25.1) ; extra == "recaptcha"
31
- Requires-Dist: setuptools (>=77.0.1)
31
+ Requires-Dist: setuptools (>=78.1.0)
32
32
  Requires-Dist: typing-extensions (>=4.12.2,<5.0.0) ; python_version < "3.12"
33
- Requires-Dist: tzdata (>=2025.1)
33
+ Requires-Dist: tzdata (>=2025.2)
34
34
  Requires-Dist: w3lib (>=2.3.1)
35
35
  Project-URL: Issues, https://github.com/Lookyloo/PlaywrightCapture/issues
36
36
  Project-URL: Repository, https://github.com/Lookyloo/PlaywrightCapture
@@ -0,0 +1,9 @@
1
+ playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
2
+ playwrightcapture/capture.py,sha256=Iicc_nNjlztCMGIJ9wSB6UhKoIcVJCh_00BssV68XDU,82297
3
+ playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
4
+ playwrightcapture/helpers.py,sha256=Xqs09zHhzAWnpBtQ0A9YAxg80P3Lj7aBj5M2WuEr0so,1843
5
+ playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ playwrightcapture-1.28.4.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
7
+ playwrightcapture-1.28.4.dist-info/METADATA,sha256=o32IMwzDiGFMVmlmaHJF0JTg0p2r5_kZ9KLXuJlVI9M,3075
8
+ playwrightcapture-1.28.4.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
9
+ playwrightcapture-1.28.4.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: poetry-core 2.1.1
2
+ Generator: poetry-core 2.1.2
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
@@ -1,9 +0,0 @@
1
- playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
2
- playwrightcapture/capture.py,sha256=UbZ8kA_JLoWdu2-vxS0-ScV41DEp-WyTk1bYRY5YANA,81515
3
- playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
4
- playwrightcapture/helpers.py,sha256=Xqs09zHhzAWnpBtQ0A9YAxg80P3Lj7aBj5M2WuEr0so,1843
5
- playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- playwrightcapture-1.28.2.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
7
- playwrightcapture-1.28.2.dist-info/METADATA,sha256=a_Jpi4OyZcmYkV82L-cHdPTuC_woBOlSnj-VTorJZKI,3075
8
- playwrightcapture-1.28.2.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
9
- playwrightcapture-1.28.2.dist-info/RECORD,,