PlaywrightCapture 1.28.3__tar.gz → 1.28.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: PlaywrightCapture
3
- Version: 1.28.3
3
+ Version: 1.28.5
4
4
  Summary: A simple library to capture websites using playwright
5
5
  License: BSD-3-Clause
6
6
  Author: Raphaël Vinot
@@ -20,7 +20,7 @@ Classifier: Topic :: Security
20
20
  Provides-Extra: recaptcha
21
21
  Requires-Dist: SpeechRecognition (>=3.14.2) ; extra == "recaptcha"
22
22
  Requires-Dist: aiohttp-socks (>=0.10.1)
23
- Requires-Dist: aiohttp[speedups] (>=3.11.14)
23
+ Requires-Dist: aiohttp[speedups] (>=3.11.16)
24
24
  Requires-Dist: async-timeout (>=5.0.1) ; python_version < "3.11"
25
25
  Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.13.3)
26
26
  Requires-Dist: dateparser (>=1.2.1)
@@ -159,8 +159,9 @@ class Capture():
159
159
  master_logger = logging.getLogger('playwrightcapture')
160
160
  master_logger.setLevel(loglevel)
161
161
  self.logger: Logger | PlaywrightCaptureLogAdapter
162
- if uuid is not None:
163
- self.logger = PlaywrightCaptureLogAdapter(master_logger, {'uuid': uuid})
162
+ self.uuid = uuid
163
+ if self.uuid is not None:
164
+ self.logger = PlaywrightCaptureLogAdapter(master_logger, {'uuid': self.uuid})
164
165
  else:
165
166
  self.logger = master_logger
166
167
  self.browser_name: BROWSER = browser if browser else 'chromium'
@@ -736,7 +737,7 @@ class Capture():
736
737
  got_button: bool = False
737
738
  try:
738
739
  try:
739
- async with timeout(5):
740
+ async with timeout(3):
740
741
  if await frame.locator("button.button__acceptAll").is_visible():
741
742
  self.logger.info('Consent window found, clicking through.')
742
743
  got_button = True
@@ -746,7 +747,7 @@ class Capture():
746
747
 
747
748
  for label in labels_to_click:
748
749
  try:
749
- async with timeout(5):
750
+ async with timeout(3):
750
751
  if await frame.get_by_label(label).is_visible():
751
752
  got_button = True
752
753
  self.logger.debug(f'Got button by label on frame: {label}')
@@ -756,7 +757,7 @@ class Capture():
756
757
  self.logger.warning(f'Consent timeout (label {label}) : {e}')
757
758
 
758
759
  try:
759
- async with timeout(5):
760
+ async with timeout(3):
760
761
  if await frame.get_by_role("button", name=label).is_visible():
761
762
  got_button = True
762
763
  self.logger.debug(f'Got button by role on frame: {label}')
@@ -780,7 +781,15 @@ class Capture():
780
781
  except Exception as e:
781
782
  self.logger.info(f'Error while moving time forward: {e}')
782
783
 
783
- async def __instrumentation(self, page: Page, url: str, allow_tracking: bool, clock_set: bool) -> None:
784
+ async def __instrumentation(self, page: Page, url: str, allow_tracking: bool) -> None:
785
+ try:
786
+ # NOTE: the clock must be installed after the page is loaded, otherwise it sometimes cause the complete capture to hang.
787
+ await page.clock.install()
788
+ clock_set = True
789
+ except Error as e:
790
+ self.logger.warning(f'Unable to install the clock: {e}')
791
+ clock_set = False
792
+
784
793
  # page instrumentation
785
794
  await self._wait_for_random_timeout(page, 5) # Wait 5 sec after document loaded
786
795
  self.logger.debug('Start instrumentation.')
@@ -923,7 +932,6 @@ class Capture():
923
932
  with_screenshot: bool=True,
924
933
  with_favicon: bool=False,
925
934
  allow_tracking: bool=False,
926
- clock_set: bool=False
927
935
  ) -> CaptureResponse:
928
936
 
929
937
  to_return: CaptureResponse = {}
@@ -991,13 +999,6 @@ class Capture():
991
999
  self.should_retry = True
992
1000
  return to_return
993
1001
 
994
- try:
995
- await page.clock.install()
996
- clock_set = True
997
- except Error as e:
998
- self.logger.warning(f'Unable to install the clock: {e}')
999
- clock_set = False
1000
-
1001
1002
  if allow_tracking:
1002
1003
  # Add authorization clickthroughs
1003
1004
  await self.__dialog_didomi_clickthrough(page)
@@ -1020,8 +1021,8 @@ class Capture():
1020
1021
 
1021
1022
  try:
1022
1023
  try:
1023
- await page.goto(url, wait_until='domcontentloaded', referer=referer if referer else '')
1024
1024
  page.on("download", handle_download)
1025
+ await page.goto(url, wait_until='domcontentloaded', referer=referer if referer else '')
1025
1026
  except Error as initial_error:
1026
1027
  self._update_exceptions(initial_error)
1027
1028
  # So this one is really annoying: chromium raises a net::ERR_ABORTED when it hits a download
@@ -1066,27 +1067,13 @@ class Capture():
1066
1067
 
1067
1068
  try:
1068
1069
  if self.headless:
1069
- await self.__instrumentation(page, url, allow_tracking, clock_set)
1070
+ await self.__instrumentation(page, url, allow_tracking)
1070
1071
  else:
1071
1072
  self.logger.debug('Headed mode, skipping instrumentation.')
1072
1073
  await self._wait_for_random_timeout(page, self._capture_timeout - 5)
1073
1074
  except Exception as e:
1074
1075
  self.logger.exception(f'Error during instrumentation: {e}')
1075
1076
 
1076
- if multiple_downloads:
1077
- if len(multiple_downloads) == 1:
1078
- to_return["downloaded_filename"] = multiple_downloads[0][0]
1079
- to_return["downloaded_file"] = multiple_downloads[0][1]
1080
- else:
1081
- # we have multiple downloads, making it a zip
1082
- mem_zip = BytesIO()
1083
- to_return["downloaded_filename"] = 'multiple_downloads.zip'
1084
- with ZipFile(mem_zip, 'w') as z:
1085
- for i, f_details in enumerate(multiple_downloads):
1086
- filename, file_content = f_details
1087
- z.writestr(f'{i}_{filename}', file_content)
1088
- to_return["downloaded_file"] = mem_zip.getvalue()
1089
-
1090
1077
  if content := await self._failsafe_get_content(page):
1091
1078
  to_return['html'] = content
1092
1079
 
@@ -1134,7 +1121,7 @@ class Capture():
1134
1121
  page=page, depth=depth,
1135
1122
  rendered_hostname_only=rendered_hostname_only,
1136
1123
  max_depth_capture_time=max_capture_time,
1137
- clock_set=clock_set, with_screenshot=with_screenshot)
1124
+ with_screenshot=with_screenshot)
1138
1125
  to_return['children'].append(child_capture) # type: ignore[union-attr]
1139
1126
  except (TimeoutError, asyncio.TimeoutError):
1140
1127
  self.logger.info(f'Timeout error, took more than {max_capture_time}s. Unable to capture {url}.')
@@ -1199,13 +1186,27 @@ class Capture():
1199
1186
  finally:
1200
1187
  self.logger.debug('Finishing up capture.')
1201
1188
  if not capturing_sub:
1189
+ if multiple_downloads:
1190
+ if len(multiple_downloads) == 1:
1191
+ to_return["downloaded_filename"] = multiple_downloads[0][0]
1192
+ to_return["downloaded_file"] = multiple_downloads[0][1]
1193
+ else:
1194
+ # we have multiple downloads, making it a zip, make sure the filename is unique
1195
+ mem_zip = BytesIO()
1196
+ to_return["downloaded_filename"] = f'{self.uuid}_multiple_downloads.zip'
1197
+ with ZipFile(mem_zip, 'w') as z:
1198
+ for i, f_details in enumerate(multiple_downloads):
1199
+ filename, file_content = f_details
1200
+ z.writestr(f'{i}_{filename}', file_content)
1201
+ to_return["downloaded_file"] = mem_zip.getvalue()
1202
+
1202
1203
  try:
1203
- to_return['storage'] = await self.context.storage_state(indexed_db=True)
1204
- to_return['cookies'] = await self.context.cookies()
1205
- self.logger.debug('Done with cookies.')
1204
+ to_return['storage'] = await self._failsafe_get_storage()
1205
+ to_return['cookies'] = await self._failsafe_get_cookies()
1206
+ self.logger.debug('Done with cookies and storage.')
1206
1207
  except Exception as e:
1207
1208
  if 'error' not in to_return:
1208
- to_return['error'] = f'Unable to get the cookies: {e}'
1209
+ to_return['error'] = f'Unable to get the storage: {e}'
1209
1210
  # frames_tree = self.make_frame_tree(page.main_frame)
1210
1211
  try:
1211
1212
  async with timeout(60):
@@ -1227,6 +1228,24 @@ class Capture():
1227
1228
  self.logger.debug('Capture done')
1228
1229
  return to_return
1229
1230
 
1231
+ async def _failsafe_get_cookies(self) -> list[Cookie] | None:
1232
+ try:
1233
+ async with timeout(15):
1234
+ return await self.context.cookies()
1235
+ except (TimeoutError, asyncio.TimeoutError):
1236
+ self.logger.warning("Unable to get cookies (timeout).")
1237
+ return None
1238
+
1239
+ async def _failsafe_get_storage(self) -> StorageState | None:
1240
+ try:
1241
+ async with timeout(15):
1242
+ return await self.context.storage_state(indexed_db=True)
1243
+ except (TimeoutError, asyncio.TimeoutError):
1244
+ self.logger.warning("Unable to get storage (timeout).")
1245
+ except Error as e:
1246
+ self.logger.warning(f"Unable to get storage: {e}")
1247
+ return None
1248
+
1230
1249
  async def _failsafe_get_screenshot(self, page: Page) -> bytes:
1231
1250
  self.logger.debug("Capturing a screenshot of the full page.")
1232
1251
  try:
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "PlaywrightCapture"
3
- version = "1.28.3"
3
+ version = "1.28.5"
4
4
  description = "A simple library to capture websites using playwright"
5
5
  authors = [
6
6
  {name="Raphaël Vinot", email= "raphael.vinot@circl.lu"}
@@ -21,7 +21,7 @@ dependencies = [
21
21
  "setuptools (>=78.1.0)",
22
22
  "puremagic (>=1.28)",
23
23
  "async-timeout (>=5.0.1) ; python_version < \"3.11\"",
24
- "aiohttp[speedups] (>=3.11.14)",
24
+ "aiohttp[speedups] (>=3.11.16)",
25
25
  "aiohttp-socks (>=0.10.1)",
26
26
  "typing-extensions (>=4.12.2,<5.0.0) ; python_version < \"3.12\""
27
27
  ]