PlaywrightCapture 1.28.2__py3-none-any.whl → 1.28.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- playwrightcapture/capture.py +39 -20
- {playwrightcapture-1.28.2.dist-info → playwrightcapture-1.28.4.dist-info}/METADATA +5 -5
- playwrightcapture-1.28.4.dist-info/RECORD +9 -0
- {playwrightcapture-1.28.2.dist-info → playwrightcapture-1.28.4.dist-info}/WHEEL +1 -1
- playwrightcapture-1.28.2.dist-info/RECORD +0 -9
- {playwrightcapture-1.28.2.dist-info → playwrightcapture-1.28.4.dist-info}/LICENSE +0 -0
playwrightcapture/capture.py
CHANGED
@@ -736,7 +736,7 @@ class Capture():
|
|
736
736
|
got_button: bool = False
|
737
737
|
try:
|
738
738
|
try:
|
739
|
-
async with timeout(
|
739
|
+
async with timeout(3):
|
740
740
|
if await frame.locator("button.button__acceptAll").is_visible():
|
741
741
|
self.logger.info('Consent window found, clicking through.')
|
742
742
|
got_button = True
|
@@ -746,7 +746,7 @@ class Capture():
|
|
746
746
|
|
747
747
|
for label in labels_to_click:
|
748
748
|
try:
|
749
|
-
async with timeout(
|
749
|
+
async with timeout(3):
|
750
750
|
if await frame.get_by_label(label).is_visible():
|
751
751
|
got_button = True
|
752
752
|
self.logger.debug(f'Got button by label on frame: {label}')
|
@@ -756,7 +756,7 @@ class Capture():
|
|
756
756
|
self.logger.warning(f'Consent timeout (label {label}) : {e}')
|
757
757
|
|
758
758
|
try:
|
759
|
-
async with timeout(
|
759
|
+
async with timeout(3):
|
760
760
|
if await frame.get_by_role("button", name=label).is_visible():
|
761
761
|
got_button = True
|
762
762
|
self.logger.debug(f'Got button by role on frame: {label}')
|
@@ -780,7 +780,15 @@ class Capture():
|
|
780
780
|
except Exception as e:
|
781
781
|
self.logger.info(f'Error while moving time forward: {e}')
|
782
782
|
|
783
|
-
async def __instrumentation(self, page: Page, url: str, allow_tracking: bool
|
783
|
+
async def __instrumentation(self, page: Page, url: str, allow_tracking: bool) -> None:
|
784
|
+
try:
|
785
|
+
# NOTE: the clock must be installed after the page is loaded, otherwise it sometimes causes the complete capture to hang.
|
786
|
+
await page.clock.install()
|
787
|
+
clock_set = True
|
788
|
+
except Error as e:
|
789
|
+
self.logger.warning(f'Unable to install the clock: {e}')
|
790
|
+
clock_set = False
|
791
|
+
|
784
792
|
# page instrumentation
|
785
793
|
await self._wait_for_random_timeout(page, 5) # Wait 5 sec after document loaded
|
786
794
|
self.logger.debug('Start instrumentation.')
|
@@ -920,9 +928,9 @@ class Capture():
|
|
920
928
|
referer: str | None=None,
|
921
929
|
page: Page | None=None, depth: int=0,
|
922
930
|
rendered_hostname_only: bool=True,
|
931
|
+
with_screenshot: bool=True,
|
923
932
|
with_favicon: bool=False,
|
924
933
|
allow_tracking: bool=False,
|
925
|
-
clock_set: bool=False
|
926
934
|
) -> CaptureResponse:
|
927
935
|
|
928
936
|
to_return: CaptureResponse = {}
|
@@ -990,13 +998,6 @@ class Capture():
|
|
990
998
|
self.should_retry = True
|
991
999
|
return to_return
|
992
1000
|
|
993
|
-
try:
|
994
|
-
await page.clock.install()
|
995
|
-
clock_set = True
|
996
|
-
except Error as e:
|
997
|
-
self.logger.warning(f'Unable to install the clock: {e}')
|
998
|
-
clock_set = False
|
999
|
-
|
1000
1001
|
if allow_tracking:
|
1001
1002
|
# Add authorization clickthroughs
|
1002
1003
|
await self.__dialog_didomi_clickthrough(page)
|
@@ -1019,8 +1020,8 @@ class Capture():
|
|
1019
1020
|
|
1020
1021
|
try:
|
1021
1022
|
try:
|
1022
|
-
await page.goto(url, wait_until='domcontentloaded', referer=referer if referer else '')
|
1023
1023
|
page.on("download", handle_download)
|
1024
|
+
await page.goto(url, wait_until='domcontentloaded', referer=referer if referer else '')
|
1024
1025
|
except Error as initial_error:
|
1025
1026
|
self._update_exceptions(initial_error)
|
1026
1027
|
# So this one is really annoying: chromium raises a net::ERR_ABORTED when it hits a download
|
@@ -1065,7 +1066,7 @@ class Capture():
|
|
1065
1066
|
|
1066
1067
|
try:
|
1067
1068
|
if self.headless:
|
1068
|
-
await self.__instrumentation(page, url, allow_tracking
|
1069
|
+
await self.__instrumentation(page, url, allow_tracking)
|
1069
1070
|
else:
|
1070
1071
|
self.logger.debug('Headed mode, skipping instrumentation.')
|
1071
1072
|
await self._wait_for_random_timeout(page, self._capture_timeout - 5)
|
@@ -1099,7 +1100,9 @@ class Capture():
|
|
1099
1100
|
self.logger.warning(f'Unable to get favicons: {e}')
|
1100
1101
|
|
1101
1102
|
to_return['last_redirected_url'] = page.url
|
1102
|
-
|
1103
|
+
|
1104
|
+
if with_screenshot:
|
1105
|
+
to_return['png'] = await self._failsafe_get_screenshot(page)
|
1103
1106
|
|
1104
1107
|
self._already_captured.add(url)
|
1105
1108
|
if depth > 0 and to_return.get('html') and to_return['html']:
|
@@ -1131,7 +1134,7 @@ class Capture():
|
|
1131
1134
|
page=page, depth=depth,
|
1132
1135
|
rendered_hostname_only=rendered_hostname_only,
|
1133
1136
|
max_depth_capture_time=max_capture_time,
|
1134
|
-
|
1137
|
+
with_screenshot=with_screenshot)
|
1135
1138
|
to_return['children'].append(child_capture) # type: ignore[union-attr]
|
1136
1139
|
except (TimeoutError, asyncio.TimeoutError):
|
1137
1140
|
self.logger.info(f'Timeout error, took more than {max_capture_time}s. Unable to capture {url}.')
|
@@ -1197,12 +1200,12 @@ class Capture():
|
|
1197
1200
|
self.logger.debug('Finishing up capture.')
|
1198
1201
|
if not capturing_sub:
|
1199
1202
|
try:
|
1200
|
-
to_return['storage'] = await self.
|
1201
|
-
to_return['cookies'] = await self.
|
1202
|
-
self.logger.debug('Done with cookies.')
|
1203
|
+
to_return['storage'] = await self._failsafe_get_storage()
|
1204
|
+
to_return['cookies'] = await self._failsafe_get_cookies()
|
1205
|
+
self.logger.debug('Done with cookies and storage.')
|
1203
1206
|
except Exception as e:
|
1204
1207
|
if 'error' not in to_return:
|
1205
|
-
to_return['error'] = f'Unable to get the
|
1208
|
+
to_return['error'] = f'Unable to get the storage: {e}'
|
1206
1209
|
# frames_tree = self.make_frame_tree(page.main_frame)
|
1207
1210
|
try:
|
1208
1211
|
async with timeout(60):
|
@@ -1224,6 +1227,22 @@ class Capture():
|
|
1224
1227
|
self.logger.debug('Capture done')
|
1225
1228
|
return to_return
|
1226
1229
|
|
1230
|
+
async def _failsafe_get_cookies(self) -> list[Cookie] | None:
|
1231
|
+
try:
|
1232
|
+
async with timeout(15):
|
1233
|
+
return await self.context.cookies()
|
1234
|
+
except (TimeoutError, asyncio.TimeoutError):
|
1235
|
+
self.logger.warning("Unable to get cookies (timeout).")
|
1236
|
+
return None
|
1237
|
+
|
1238
|
+
async def _failsafe_get_storage(self) -> StorageState | None:
|
1239
|
+
try:
|
1240
|
+
async with timeout(15):
|
1241
|
+
return await self.context.storage_state(indexed_db=True)
|
1242
|
+
except (TimeoutError, asyncio.TimeoutError):
|
1243
|
+
self.logger.warning("Unable to get storage (timeout).")
|
1244
|
+
return None
|
1245
|
+
|
1227
1246
|
async def _failsafe_get_screenshot(self, page: Page) -> bytes:
|
1228
1247
|
self.logger.debug("Capturing a screenshot of the full page.")
|
1229
1248
|
try:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: PlaywrightCapture
|
3
|
-
Version: 1.28.
|
3
|
+
Version: 1.28.4
|
4
4
|
Summary: A simple library to capture websites using playwright
|
5
5
|
License: BSD-3-Clause
|
6
6
|
Author: Raphaël Vinot
|
@@ -18,9 +18,9 @@ Classifier: Programming Language :: Python :: 3.13
|
|
18
18
|
Classifier: Topic :: Internet
|
19
19
|
Classifier: Topic :: Security
|
20
20
|
Provides-Extra: recaptcha
|
21
|
-
Requires-Dist: SpeechRecognition (>=3.14.
|
21
|
+
Requires-Dist: SpeechRecognition (>=3.14.2) ; extra == "recaptcha"
|
22
22
|
Requires-Dist: aiohttp-socks (>=0.10.1)
|
23
|
-
Requires-Dist: aiohttp[speedups] (>=3.11.
|
23
|
+
Requires-Dist: aiohttp[speedups] (>=3.11.16)
|
24
24
|
Requires-Dist: async-timeout (>=5.0.1) ; python_version < "3.11"
|
25
25
|
Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.13.3)
|
26
26
|
Requires-Dist: dateparser (>=1.2.1)
|
@@ -28,9 +28,9 @@ Requires-Dist: playwright (>=1.51.0)
|
|
28
28
|
Requires-Dist: playwright-stealth (>=1.0.6)
|
29
29
|
Requires-Dist: puremagic (>=1.28)
|
30
30
|
Requires-Dist: pydub (>=0.25.1) ; extra == "recaptcha"
|
31
|
-
Requires-Dist: setuptools (>=
|
31
|
+
Requires-Dist: setuptools (>=78.1.0)
|
32
32
|
Requires-Dist: typing-extensions (>=4.12.2,<5.0.0) ; python_version < "3.12"
|
33
|
-
Requires-Dist: tzdata (>=2025.
|
33
|
+
Requires-Dist: tzdata (>=2025.2)
|
34
34
|
Requires-Dist: w3lib (>=2.3.1)
|
35
35
|
Project-URL: Issues, https://github.com/Lookyloo/PlaywrightCapture/issues
|
36
36
|
Project-URL: Repository, https://github.com/Lookyloo/PlaywrightCapture
|
@@ -0,0 +1,9 @@
|
|
1
|
+
playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
|
2
|
+
playwrightcapture/capture.py,sha256=Iicc_nNjlztCMGIJ9wSB6UhKoIcVJCh_00BssV68XDU,82297
|
3
|
+
playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
|
4
|
+
playwrightcapture/helpers.py,sha256=Xqs09zHhzAWnpBtQ0A9YAxg80P3Lj7aBj5M2WuEr0so,1843
|
5
|
+
playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
+
playwrightcapture-1.28.4.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
|
7
|
+
playwrightcapture-1.28.4.dist-info/METADATA,sha256=o32IMwzDiGFMVmlmaHJF0JTg0p2r5_kZ9KLXuJlVI9M,3075
|
8
|
+
playwrightcapture-1.28.4.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
|
9
|
+
playwrightcapture-1.28.4.dist-info/RECORD,,
|
@@ -1,9 +0,0 @@
|
|
1
|
-
playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
|
2
|
-
playwrightcapture/capture.py,sha256=UbZ8kA_JLoWdu2-vxS0-ScV41DEp-WyTk1bYRY5YANA,81515
|
3
|
-
playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
|
4
|
-
playwrightcapture/helpers.py,sha256=Xqs09zHhzAWnpBtQ0A9YAxg80P3Lj7aBj5M2WuEr0so,1843
|
5
|
-
playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
-
playwrightcapture-1.28.2.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
|
7
|
-
playwrightcapture-1.28.2.dist-info/METADATA,sha256=a_Jpi4OyZcmYkV82L-cHdPTuC_woBOlSnj-VTorJZKI,3075
|
8
|
-
playwrightcapture-1.28.2.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
|
9
|
-
playwrightcapture-1.28.2.dist-info/RECORD,,
|
File without changes
|