PlaywrightCapture 1.28.3__py3-none-any.whl → 1.28.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- playwrightcapture/capture.py +54 -35
- {playwrightcapture-1.28.3.dist-info → playwrightcapture-1.28.5.dist-info}/METADATA +2 -2
- playwrightcapture-1.28.5.dist-info/RECORD +9 -0
- {playwrightcapture-1.28.3.dist-info → playwrightcapture-1.28.5.dist-info}/WHEEL +1 -1
- playwrightcapture-1.28.3.dist-info/RECORD +0 -9
- {playwrightcapture-1.28.3.dist-info → playwrightcapture-1.28.5.dist-info}/LICENSE +0 -0
playwrightcapture/capture.py
CHANGED
@@ -159,8 +159,9 @@ class Capture():
|
|
159
159
|
master_logger = logging.getLogger('playwrightcapture')
|
160
160
|
master_logger.setLevel(loglevel)
|
161
161
|
self.logger: Logger | PlaywrightCaptureLogAdapter
|
162
|
-
|
163
|
-
|
162
|
+
self.uuid = uuid
|
163
|
+
if self.uuid is not None:
|
164
|
+
self.logger = PlaywrightCaptureLogAdapter(master_logger, {'uuid': self.uuid})
|
164
165
|
else:
|
165
166
|
self.logger = master_logger
|
166
167
|
self.browser_name: BROWSER = browser if browser else 'chromium'
|
@@ -736,7 +737,7 @@ class Capture():
|
|
736
737
|
got_button: bool = False
|
737
738
|
try:
|
738
739
|
try:
|
739
|
-
async with timeout(
|
740
|
+
async with timeout(3):
|
740
741
|
if await frame.locator("button.button__acceptAll").is_visible():
|
741
742
|
self.logger.info('Consent window found, clicking through.')
|
742
743
|
got_button = True
|
@@ -746,7 +747,7 @@ class Capture():
|
|
746
747
|
|
747
748
|
for label in labels_to_click:
|
748
749
|
try:
|
749
|
-
async with timeout(
|
750
|
+
async with timeout(3):
|
750
751
|
if await frame.get_by_label(label).is_visible():
|
751
752
|
got_button = True
|
752
753
|
self.logger.debug(f'Got button by label on frame: {label}')
|
@@ -756,7 +757,7 @@ class Capture():
|
|
756
757
|
self.logger.warning(f'Consent timeout (label {label}) : {e}')
|
757
758
|
|
758
759
|
try:
|
759
|
-
async with timeout(
|
760
|
+
async with timeout(3):
|
760
761
|
if await frame.get_by_role("button", name=label).is_visible():
|
761
762
|
got_button = True
|
762
763
|
self.logger.debug(f'Got button by role on frame: {label}')
|
@@ -780,7 +781,15 @@ class Capture():
|
|
780
781
|
except Exception as e:
|
781
782
|
self.logger.info(f'Error while moving time forward: {e}')
|
782
783
|
|
783
|
-
async def __instrumentation(self, page: Page, url: str, allow_tracking: bool
|
784
|
+
async def __instrumentation(self, page: Page, url: str, allow_tracking: bool) -> None:
|
785
|
+
try:
|
786
|
+
# NOTE: the clock must be installed after the page is loaded, otherwise it sometimes cause the complete capture to hang.
|
787
|
+
await page.clock.install()
|
788
|
+
clock_set = True
|
789
|
+
except Error as e:
|
790
|
+
self.logger.warning(f'Unable to install the clock: {e}')
|
791
|
+
clock_set = False
|
792
|
+
|
784
793
|
# page instrumentation
|
785
794
|
await self._wait_for_random_timeout(page, 5) # Wait 5 sec after document loaded
|
786
795
|
self.logger.debug('Start instrumentation.')
|
@@ -923,7 +932,6 @@ class Capture():
|
|
923
932
|
with_screenshot: bool=True,
|
924
933
|
with_favicon: bool=False,
|
925
934
|
allow_tracking: bool=False,
|
926
|
-
clock_set: bool=False
|
927
935
|
) -> CaptureResponse:
|
928
936
|
|
929
937
|
to_return: CaptureResponse = {}
|
@@ -991,13 +999,6 @@ class Capture():
|
|
991
999
|
self.should_retry = True
|
992
1000
|
return to_return
|
993
1001
|
|
994
|
-
try:
|
995
|
-
await page.clock.install()
|
996
|
-
clock_set = True
|
997
|
-
except Error as e:
|
998
|
-
self.logger.warning(f'Unable to install the clock: {e}')
|
999
|
-
clock_set = False
|
1000
|
-
|
1001
1002
|
if allow_tracking:
|
1002
1003
|
# Add authorization clickthroughs
|
1003
1004
|
await self.__dialog_didomi_clickthrough(page)
|
@@ -1020,8 +1021,8 @@ class Capture():
|
|
1020
1021
|
|
1021
1022
|
try:
|
1022
1023
|
try:
|
1023
|
-
await page.goto(url, wait_until='domcontentloaded', referer=referer if referer else '')
|
1024
1024
|
page.on("download", handle_download)
|
1025
|
+
await page.goto(url, wait_until='domcontentloaded', referer=referer if referer else '')
|
1025
1026
|
except Error as initial_error:
|
1026
1027
|
self._update_exceptions(initial_error)
|
1027
1028
|
# So this one is really annoying: chromium raises a net::ERR_ABORTED when it hits a download
|
@@ -1066,27 +1067,13 @@ class Capture():
|
|
1066
1067
|
|
1067
1068
|
try:
|
1068
1069
|
if self.headless:
|
1069
|
-
await self.__instrumentation(page, url, allow_tracking
|
1070
|
+
await self.__instrumentation(page, url, allow_tracking)
|
1070
1071
|
else:
|
1071
1072
|
self.logger.debug('Headed mode, skipping instrumentation.')
|
1072
1073
|
await self._wait_for_random_timeout(page, self._capture_timeout - 5)
|
1073
1074
|
except Exception as e:
|
1074
1075
|
self.logger.exception(f'Error during instrumentation: {e}')
|
1075
1076
|
|
1076
|
-
if multiple_downloads:
|
1077
|
-
if len(multiple_downloads) == 1:
|
1078
|
-
to_return["downloaded_filename"] = multiple_downloads[0][0]
|
1079
|
-
to_return["downloaded_file"] = multiple_downloads[0][1]
|
1080
|
-
else:
|
1081
|
-
# we have multiple downloads, making it a zip
|
1082
|
-
mem_zip = BytesIO()
|
1083
|
-
to_return["downloaded_filename"] = 'multiple_downloads.zip'
|
1084
|
-
with ZipFile(mem_zip, 'w') as z:
|
1085
|
-
for i, f_details in enumerate(multiple_downloads):
|
1086
|
-
filename, file_content = f_details
|
1087
|
-
z.writestr(f'{i}_{filename}', file_content)
|
1088
|
-
to_return["downloaded_file"] = mem_zip.getvalue()
|
1089
|
-
|
1090
1077
|
if content := await self._failsafe_get_content(page):
|
1091
1078
|
to_return['html'] = content
|
1092
1079
|
|
@@ -1134,7 +1121,7 @@ class Capture():
|
|
1134
1121
|
page=page, depth=depth,
|
1135
1122
|
rendered_hostname_only=rendered_hostname_only,
|
1136
1123
|
max_depth_capture_time=max_capture_time,
|
1137
|
-
|
1124
|
+
with_screenshot=with_screenshot)
|
1138
1125
|
to_return['children'].append(child_capture) # type: ignore[union-attr]
|
1139
1126
|
except (TimeoutError, asyncio.TimeoutError):
|
1140
1127
|
self.logger.info(f'Timeout error, took more than {max_capture_time}s. Unable to capture {url}.')
|
@@ -1199,13 +1186,27 @@ class Capture():
|
|
1199
1186
|
finally:
|
1200
1187
|
self.logger.debug('Finishing up capture.')
|
1201
1188
|
if not capturing_sub:
|
1189
|
+
if multiple_downloads:
|
1190
|
+
if len(multiple_downloads) == 1:
|
1191
|
+
to_return["downloaded_filename"] = multiple_downloads[0][0]
|
1192
|
+
to_return["downloaded_file"] = multiple_downloads[0][1]
|
1193
|
+
else:
|
1194
|
+
# we have multiple downloads, making it a zip, make sure the filename is unique
|
1195
|
+
mem_zip = BytesIO()
|
1196
|
+
to_return["downloaded_filename"] = f'{self.uuid}_multiple_downloads.zip'
|
1197
|
+
with ZipFile(mem_zip, 'w') as z:
|
1198
|
+
for i, f_details in enumerate(multiple_downloads):
|
1199
|
+
filename, file_content = f_details
|
1200
|
+
z.writestr(f'{i}_{filename}', file_content)
|
1201
|
+
to_return["downloaded_file"] = mem_zip.getvalue()
|
1202
|
+
|
1202
1203
|
try:
|
1203
|
-
to_return['storage'] = await self.
|
1204
|
-
to_return['cookies'] = await self.
|
1205
|
-
self.logger.debug('Done with cookies.')
|
1204
|
+
to_return['storage'] = await self._failsafe_get_storage()
|
1205
|
+
to_return['cookies'] = await self._failsafe_get_cookies()
|
1206
|
+
self.logger.debug('Done with cookies and storage.')
|
1206
1207
|
except Exception as e:
|
1207
1208
|
if 'error' not in to_return:
|
1208
|
-
to_return['error'] = f'Unable to get the
|
1209
|
+
to_return['error'] = f'Unable to get the storage: {e}'
|
1209
1210
|
# frames_tree = self.make_frame_tree(page.main_frame)
|
1210
1211
|
try:
|
1211
1212
|
async with timeout(60):
|
@@ -1227,6 +1228,24 @@ class Capture():
|
|
1227
1228
|
self.logger.debug('Capture done')
|
1228
1229
|
return to_return
|
1229
1230
|
|
1231
|
+
async def _failsafe_get_cookies(self) -> list[Cookie] | None:
|
1232
|
+
try:
|
1233
|
+
async with timeout(15):
|
1234
|
+
return await self.context.cookies()
|
1235
|
+
except (TimeoutError, asyncio.TimeoutError):
|
1236
|
+
self.logger.warning("Unable to get cookies (timeout).")
|
1237
|
+
return None
|
1238
|
+
|
1239
|
+
async def _failsafe_get_storage(self) -> StorageState | None:
|
1240
|
+
try:
|
1241
|
+
async with timeout(15):
|
1242
|
+
return await self.context.storage_state(indexed_db=True)
|
1243
|
+
except (TimeoutError, asyncio.TimeoutError):
|
1244
|
+
self.logger.warning("Unable to get storage (timeout).")
|
1245
|
+
except Error as e:
|
1246
|
+
self.logger.warning(f"Unable to get storage: {e}")
|
1247
|
+
return None
|
1248
|
+
|
1230
1249
|
async def _failsafe_get_screenshot(self, page: Page) -> bytes:
|
1231
1250
|
self.logger.debug("Capturing a screenshot of the full page.")
|
1232
1251
|
try:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: PlaywrightCapture
|
3
|
-
Version: 1.28.3
|
3
|
+
Version: 1.28.5
|
4
4
|
Summary: A simple library to capture websites using playwright
|
5
5
|
License: BSD-3-Clause
|
6
6
|
Author: Raphaël Vinot
|
@@ -20,7 +20,7 @@ Classifier: Topic :: Security
|
|
20
20
|
Provides-Extra: recaptcha
|
21
21
|
Requires-Dist: SpeechRecognition (>=3.14.2) ; extra == "recaptcha"
|
22
22
|
Requires-Dist: aiohttp-socks (>=0.10.1)
|
23
|
-
Requires-Dist: aiohttp[speedups] (>=3.11.
|
23
|
+
Requires-Dist: aiohttp[speedups] (>=3.11.16)
|
24
24
|
Requires-Dist: async-timeout (>=5.0.1) ; python_version < "3.11"
|
25
25
|
Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.13.3)
|
26
26
|
Requires-Dist: dateparser (>=1.2.1)
|
@@ -0,0 +1,9 @@
|
|
1
|
+
playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
|
2
|
+
playwrightcapture/capture.py,sha256=nsqlSSS-GhTVRMxGgeom-rnSwV7WMMvsAWU6vnvQO_k,82469
|
3
|
+
playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
|
4
|
+
playwrightcapture/helpers.py,sha256=Xqs09zHhzAWnpBtQ0A9YAxg80P3Lj7aBj5M2WuEr0so,1843
|
5
|
+
playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
+
playwrightcapture-1.28.5.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
|
7
|
+
playwrightcapture-1.28.5.dist-info/METADATA,sha256=fWObVPNLr1bws1WbdiAhVzYqCMV5x9Izw95MIzL7SDw,3075
|
8
|
+
playwrightcapture-1.28.5.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
|
9
|
+
playwrightcapture-1.28.5.dist-info/RECORD,,
|
@@ -1,9 +0,0 @@
|
|
1
|
-
playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
|
2
|
-
playwrightcapture/capture.py,sha256=ep4zmE0HhV74Cr2iGWq16obQzkIg17wTiGHkEnq6YBc,81644
|
3
|
-
playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
|
4
|
-
playwrightcapture/helpers.py,sha256=Xqs09zHhzAWnpBtQ0A9YAxg80P3Lj7aBj5M2WuEr0so,1843
|
5
|
-
playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
-
playwrightcapture-1.28.3.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
|
7
|
-
playwrightcapture-1.28.3.dist-info/METADATA,sha256=lIke_K-KyemmKzUJ12uW0tz_0IwoWF9_KvIPJZGDP7k,3075
|
8
|
-
playwrightcapture-1.28.3.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
|
9
|
-
playwrightcapture-1.28.3.dist-info/RECORD,,
|
File without changes
|